Merge branch 'android14-5.15' into arpi-5.15.92

This commit is contained in:
Peter Yoon
2023-02-27 20:11:53 +09:00
1820 changed files with 134442 additions and 22152 deletions

1
.gitignore vendored
View File

@@ -45,6 +45,7 @@
*.symversions *.symversions
*.tab.[ch] *.tab.[ch]
*.tar *.tar
*.usyms
*.xz *.xz
*.zst *.zst
Module.symvers Module.symvers

566
BUILD.bazel Normal file
View File

@@ -0,0 +1,566 @@
# SPDX-License-Identifier: GPL-2.0
# Copyright (C) 2021 The Android Open Source Project
load("//build/bazel_common_rules/dist:dist.bzl", "copy_to_dist_dir")
load("//build/kernel/kleaf:common_kernels.bzl", "define_common_kernels", "define_db845c")
load(
"//build/kernel/kleaf:kernel.bzl",
"ddk_headers",
"kernel_abi",
"kernel_build",
"kernel_images",
"kernel_modules_install",
"kernel_unstripped_modules_archive",
)
load(":modules.bzl", "COMMON_GKI_MODULES_LIST")
# Package-level default: targets in this file are public unless they set
# their own `visibility`.
package(default_visibility = ["//visibility:public"])
# Symbol list files passed as `additional_kmi_symbol_lists` to the aarch64
# kernel configurations in this file.
_aarch64_additional_kmi_symbol_lists = [
    # keep sorted
    "android/abi_gki_aarch64_db845c",
    "android/abi_gki_aarch64_exynos",
    "android/abi_gki_aarch64_pixel",
    "android/abi_gki_aarch64_virtual_device",
]
# Per-target overrides handed to define_common_kernels(). Every configuration
# declares COMMON_GKI_MODULES_LIST as its implicit module outputs; the two
# aarch64 release/debug configurations additionally name the KMI symbol lists.
define_common_kernels(
    target_configs = {
        "kernel_aarch64": {
            # TODO(b/188620248): re-enable trimming
            "trim_nonlisted_kmi": False,
            "kmi_symbol_list_strict_mode": False,
            "module_implicit_outs": COMMON_GKI_MODULES_LIST,
            "kmi_symbol_list": "android/abi_gki_aarch64",
            "additional_kmi_symbol_lists": _aarch64_additional_kmi_symbol_lists,
        },
        "kernel_aarch64_16k": {
            "module_implicit_outs": COMMON_GKI_MODULES_LIST,
        },
        "kernel_aarch64_debug": {
            # TODO(b/188620248): re-enable trimming
            "trim_nonlisted_kmi": False,
            "kmi_symbol_list_strict_mode": False,
            "module_implicit_outs": COMMON_GKI_MODULES_LIST,
            "kmi_symbol_list": "android/abi_gki_aarch64",
            "additional_kmi_symbol_lists": _aarch64_additional_kmi_symbol_lists,
        },
        "kernel_x86_64": {
            "kmi_symbol_list_strict_mode": False,
            "module_implicit_outs": COMMON_GKI_MODULES_LIST,
        },
        "kernel_x86_64_debug": {
            "kmi_symbol_list_strict_mode": False,
            "module_implicit_outs": COMMON_GKI_MODULES_LIST,
        },
    },
)
# db845c target: declares two device tree blobs as outputs, enables ABI
# targets against the db845c symbol list, and enumerates every kernel module
# (.ko) the build is expected to produce.
define_db845c(
    name = "db845c",
    outs = [
        "arch/arm64/boot/dts/qcom/qrb5165-rb5.dtb",
        "arch/arm64/boot/dts/qcom/sdm845-db845c.dtb",
    ],
    define_abi_targets = True,
    kmi_symbol_list = "//common:android/abi_gki_aarch64_db845c",
    kmi_symbol_list_add_only = True,
    module_outs = [
        # keep sorted
        "crypto/michael_mic.ko",
        "drivers/base/regmap/regmap-sdw.ko",
        "drivers/base/regmap/regmap-slimbus.ko",
        "drivers/bus/mhi/core/mhi.ko",
        "drivers/clk/qcom/clk-qcom.ko",
        "drivers/clk/qcom/clk-rpmh.ko",
        "drivers/clk/qcom/clk-spmi-pmic-div.ko",
        "drivers/clk/qcom/dispcc-sdm845.ko",
        "drivers/clk/qcom/dispcc-sm8250.ko",
        "drivers/clk/qcom/gcc-sdm845.ko",
        "drivers/clk/qcom/gcc-sm8250.ko",
        "drivers/clk/qcom/gpucc-sdm845.ko",
        "drivers/clk/qcom/gpucc-sm8250.ko",
        "drivers/clk/qcom/lpass-gfm-sm8250.ko",
        "drivers/clk/qcom/videocc-sdm845.ko",
        "drivers/clk/qcom/videocc-sm8250.ko",
        "drivers/cpufreq/qcom-cpufreq-hw.ko",
        "drivers/dma-buf/heaps/system_heap.ko",
        "drivers/dma/qcom/bam_dma.ko",
        "drivers/extcon/extcon-usb-gpio.ko",
        "drivers/firmware/qcom-scm.ko",
        "drivers/gpio/gpio-wcd934x.ko",
        "drivers/gpu/drm/bridge/display-connector.ko",
        "drivers/gpu/drm/bridge/lontium-lt9611.ko",
        "drivers/gpu/drm/bridge/lontium-lt9611uxc.ko",
        "drivers/gpu/drm/msm/msm.ko",
        "drivers/gpu/drm/scheduler/gpu-sched.ko",
        "drivers/hwspinlock/qcom_hwspinlock.ko",
        "drivers/i2c/busses/i2c-designware-core.ko",
        "drivers/i2c/busses/i2c-designware-platform.ko",
        "drivers/i2c/busses/i2c-qcom-geni.ko",
        "drivers/i2c/busses/i2c-qup.ko",
        "drivers/i2c/busses/i2c-rk3x.ko",
        "drivers/i2c/i2c-dev.ko",
        "drivers/i2c/i2c-mux.ko",
        "drivers/i2c/muxes/i2c-mux-pca954x.ko",
        "drivers/iio/adc/qcom-spmi-adc5.ko",
        "drivers/iio/adc/qcom-vadc-common.ko",
        "drivers/input/misc/pm8941-pwrkey.ko",
        "drivers/interconnect/qcom/icc-bcm-voter.ko",
        "drivers/interconnect/qcom/icc-osm-l3.ko",
        "drivers/interconnect/qcom/icc-rpmh.ko",
        "drivers/interconnect/qcom/qnoc-sdm845.ko",
        "drivers/interconnect/qcom/qnoc-sm8250.ko",
        "drivers/iommu/arm/arm-smmu/arm_smmu.ko",
        "drivers/irqchip/qcom-pdc.ko",
        "drivers/leds/led-class-multicolor.ko",
        "drivers/mailbox/qcom-apcs-ipc-mailbox.ko",
        "drivers/mailbox/qcom-ipcc.ko",
        "drivers/mfd/qcom-spmi-pmic.ko",
        "drivers/mfd/wcd934x.ko",
        "drivers/misc/fastrpc.ko",
        "drivers/mmc/host/cqhci.ko",
        "drivers/mmc/host/sdhci-msm.ko",
        "drivers/net/can/spi/mcp251xfd/mcp251xfd.ko",
        "drivers/net/wireless/ath/ath.ko",
        "drivers/net/wireless/ath/ath10k/ath10k_core.ko",
        "drivers/net/wireless/ath/ath10k/ath10k_pci.ko",
        "drivers/net/wireless/ath/ath10k/ath10k_snoc.ko",
        "drivers/net/wireless/ath/ath11k/ath11k.ko",
        "drivers/net/wireless/ath/ath11k/ath11k_ahb.ko",
        "drivers/net/wireless/ath/ath11k/ath11k_pci.ko",
        "drivers/nvmem/nvmem_qfprom.ko",
        "drivers/phy/qualcomm/phy-qcom-qmp.ko",
        "drivers/phy/qualcomm/phy-qcom-qusb2.ko",
        "drivers/phy/qualcomm/phy-qcom-snps-femto-v2.ko",
        "drivers/phy/qualcomm/phy-qcom-usb-hs.ko",
        "drivers/pinctrl/qcom/pinctrl-lpass-lpi.ko",
        "drivers/pinctrl/qcom/pinctrl-msm.ko",
        "drivers/pinctrl/qcom/pinctrl-sdm845.ko",
        "drivers/pinctrl/qcom/pinctrl-sm8250.ko",
        "drivers/pinctrl/qcom/pinctrl-spmi-gpio.ko",
        "drivers/pinctrl/qcom/pinctrl-spmi-mpp.ko",
        "drivers/power/reset/qcom-pon.ko",
        "drivers/power/reset/reboot-mode.ko",
        "drivers/power/reset/syscon-reboot-mode.ko",
        "drivers/regulator/gpio-regulator.ko",
        "drivers/regulator/qcom-rpmh-regulator.ko",
        "drivers/regulator/qcom_spmi-regulator.ko",
        "drivers/regulator/qcom_usb_vbus-regulator.ko",
        "drivers/remoteproc/qcom_common.ko",
        "drivers/remoteproc/qcom_pil_info.ko",
        "drivers/remoteproc/qcom_q6v5.ko",
        "drivers/remoteproc/qcom_q6v5_adsp.ko",
        "drivers/remoteproc/qcom_q6v5_mss.ko",
        "drivers/remoteproc/qcom_q6v5_pas.ko",
        "drivers/remoteproc/qcom_q6v5_wcss.ko",
        "drivers/remoteproc/qcom_sysmon.ko",
        "drivers/reset/reset-qcom-aoss.ko",
        "drivers/reset/reset-qcom-pdc.ko",
        "drivers/rpmsg/qcom_glink.ko",
        "drivers/rpmsg/qcom_glink_rpm.ko",
        "drivers/rpmsg/qcom_glink_smem.ko",
        "drivers/rpmsg/qcom_smd.ko",
        "drivers/rpmsg/rpmsg_ns.ko",
        "drivers/rtc/rtc-pm8xxx.ko",
        "drivers/slimbus/slim-qcom-ngd-ctrl.ko",
        "drivers/slimbus/slimbus.ko",
        "drivers/soc/qcom/apr.ko",
        "drivers/soc/qcom/cmd-db.ko",
        "drivers/soc/qcom/llcc-qcom.ko",
        "drivers/soc/qcom/mdt_loader.ko",
        "drivers/soc/qcom/pdr_interface.ko",
        "drivers/soc/qcom/qcom_aoss.ko",
        "drivers/soc/qcom/qcom_rpmh.ko",
        "drivers/soc/qcom/qmi_helpers.ko",
        "drivers/soc/qcom/rmtfs_mem.ko",
        "drivers/soc/qcom/rpmhpd.ko",
        "drivers/soc/qcom/smem.ko",
        "drivers/soc/qcom/smp2p.ko",
        "drivers/soc/qcom/smsm.ko",
        "drivers/soc/qcom/socinfo.ko",
        "drivers/soundwire/soundwire-bus.ko",
        "drivers/soundwire/soundwire-qcom.ko",
        "drivers/spi/spi-geni-qcom.ko",
        "drivers/spi/spi-pl022.ko",
        "drivers/spi/spi-qcom-qspi.ko",
        "drivers/spi/spi-qup.ko",
        "drivers/spmi/spmi-pmic-arb.ko",
        "drivers/thermal/qcom/lmh.ko",
        "drivers/thermal/qcom/qcom-spmi-adc-tm5.ko",
        "drivers/thermal/qcom/qcom-spmi-temp-alarm.ko",
        "drivers/thermal/qcom/qcom_tsens.ko",
        "drivers/tty/serial/msm_serial.ko",
        "drivers/ufs/host/ufs_qcom.ko",
        "drivers/usb/common/ulpi.ko",
        "drivers/usb/host/ohci-hcd.ko",
        "drivers/usb/host/ohci-pci.ko",
        "drivers/usb/host/ohci-platform.ko",
        "drivers/usb/typec/qcom-pmic-typec.ko",
        "drivers/watchdog/pm8916_wdt.ko",
        "drivers/watchdog/qcom-wdt.ko",
        "net/qrtr/ns.ko",
        "net/qrtr/qrtr.ko",
        "net/qrtr/qrtr-mhi.ko",
        "net/qrtr/qrtr-smd.ko",
        "net/qrtr/qrtr-tun.ko",
        "sound/soc/codecs/snd-soc-dmic.ko",
        "sound/soc/codecs/snd-soc-hdmi-codec.ko",
        "sound/soc/codecs/snd-soc-lpass-va-macro.ko",
        "sound/soc/codecs/snd-soc-lpass-wsa-macro.ko",
        "sound/soc/codecs/snd-soc-max98927.ko",
        "sound/soc/codecs/snd-soc-rl6231.ko",
        "sound/soc/codecs/snd-soc-rt5663.ko",
        "sound/soc/codecs/snd-soc-wcd-mbhc.ko",
        "sound/soc/codecs/snd-soc-wcd9335.ko",
        "sound/soc/codecs/snd-soc-wcd934x.ko",
        "sound/soc/codecs/snd-soc-wsa881x.ko",
        "sound/soc/qcom/qdsp6/q6adm.ko",
        "sound/soc/qcom/qdsp6/q6afe.ko",
        "sound/soc/qcom/qdsp6/q6afe-clocks.ko",
        "sound/soc/qcom/qdsp6/q6afe-dai.ko",
        "sound/soc/qcom/qdsp6/q6asm.ko",
        "sound/soc/qcom/qdsp6/q6asm-dai.ko",
        "sound/soc/qcom/qdsp6/q6core.ko",
        "sound/soc/qcom/qdsp6/q6dsp-common.ko",
        "sound/soc/qcom/qdsp6/q6routing.ko",
        "sound/soc/qcom/snd-soc-qcom-common.ko",
        "sound/soc/qcom/snd-soc-sdm845.ko",
        "sound/soc/qcom/snd-soc-sm8250.ko",
    ],
)
# rockpi4 kernel build, configured by build.config.rockpi4. Its module
# outputs are COMMON_GKI_MODULES_LIST plus the modules listed here.
# TODO(b/258259749): Convert rockpi4 to mixed build
kernel_build(
    name = "rockpi4",
    outs = [
        "Image",
        "System.map",
        "modules.builtin",
        "modules.builtin.modinfo",
        "rk3399-rock-pi-4b.dtb",
        "vmlinux",
        "vmlinux.symvers",
    ],
    build_config = "build.config.rockpi4",
    collect_unstripped_modules = True,
    kmi_symbol_list = "//common:android/abi_gki_rockpi4",
    module_outs = COMMON_GKI_MODULES_LIST + [
        # keep sorted
        "drivers/block/virtio_blk.ko",
        "drivers/char/hw_random/virtio-rng.ko",
        "drivers/clk/clk-rk808.ko",
        "drivers/cpufreq/cpufreq-dt.ko",
        "drivers/dma/pl330.ko",
        "drivers/gpu/drm/bridge/analogix/analogix_dp.ko",
        "drivers/gpu/drm/bridge/synopsys/dw-hdmi.ko",
        "drivers/gpu/drm/bridge/synopsys/dw-mipi-dsi.ko",
        "drivers/gpu/drm/rockchip/rockchipdrm.ko",
        "drivers/i2c/busses/i2c-rk3x.ko",
        "drivers/iio/adc/rockchip_saradc.ko",
        "drivers/iio/buffer/industrialio-triggered-buffer.ko",
        "drivers/iio/buffer/kfifo_buf.ko",
        "drivers/mfd/rk808.ko",
        "drivers/mmc/core/pwrseq_simple.ko",
        "drivers/mmc/host/cqhci.ko",
        "drivers/mmc/host/dw_mmc.ko",
        "drivers/mmc/host/dw_mmc-pltfm.ko",
        "drivers/mmc/host/dw_mmc-rockchip.ko",
        "drivers/mmc/host/sdhci-of-arasan.ko",
        "drivers/net/ethernet/stmicro/stmmac/dwmac-rk.ko",
        "drivers/net/ethernet/stmicro/stmmac/stmmac.ko",
        "drivers/net/ethernet/stmicro/stmmac/stmmac-platform.ko",
        "drivers/net/net_failover.ko",
        "drivers/net/pcs/pcs_xpcs.ko",
        "drivers/net/virtio_net.ko",
        "drivers/nvmem/nvmem_rockchip_efuse.ko",
        "drivers/pci/controller/pcie-rockchip-host.ko",
        "drivers/phy/rockchip/phy-rockchip-emmc.ko",
        "drivers/phy/rockchip/phy-rockchip-inno-usb2.ko",
        "drivers/phy/rockchip/phy-rockchip-pcie.ko",
        "drivers/phy/rockchip/phy-rockchip-typec.ko",
        "drivers/pwm/pwm-rockchip.ko",
        "drivers/regulator/fan53555.ko",
        "drivers/regulator/pwm-regulator.ko",
        "drivers/regulator/rk808-regulator.ko",
        "drivers/rtc/rtc-rk808.ko",
        "drivers/soc/rockchip/io-domain.ko",
        "drivers/thermal/rockchip_thermal.ko",
        "drivers/usb/host/ohci-hcd.ko",
        "drivers/usb/host/ohci-platform.ko",
        "drivers/virtio/virtio_pci.ko",
        "drivers/virtio/virtio_pci_modern_dev.ko",
        "drivers/watchdog/dw_wdt.ko",
        "net/core/failover.ko",
    ],
)
# ABI target attached to the rockpi4 kernel build.
kernel_abi(
    name = "rockpi4_abi",
    kernel_build = "//common:rockpi4",
    kmi_symbol_list_add_only = True,
)
# Module installation target for the rockpi4 kernel build.
kernel_modules_install(
    name = "rockpi4_modules_install",
    kernel_build = "//common:rockpi4",
)
# Unstripped-modules archive for :rockpi4 (which sets
# collect_unstripped_modules = True).
kernel_unstripped_modules_archive(
    name = "rockpi4_unstripped_modules_archive",
    kernel_build = ":rockpi4",
)
# Image target for rockpi4; requests an initramfs and consumes the rockpi4
# kernel build together with its modules-install target.
kernel_images(
    name = "rockpi4_images",
    build_initramfs = True,
    kernel_build = "//common:rockpi4",
    kernel_modules_install = "//common:rockpi4_modules_install",
)
# Distribution target: copies the rockpi4 build, images, installed modules and
# unstripped-modules archive into out/rockpi4/dist.
copy_to_dist_dir(
    name = "rockpi4_dist",
    data = [
        ":rockpi4",
        ":rockpi4_images",
        ":rockpi4_modules_install",
        ":rockpi4_unstripped_modules_archive",
    ],
    dist_dir = "out/rockpi4/dist",
    flat = True,
)
# fips140 module build on top of :kernel_aarch64. It declares no regular
# outputs; its only module output is crypto/fips140.ko.
kernel_build(
    name = "fips140",
    outs = [],
    base_kernel = ":kernel_aarch64",
    build_config = "build.config.gki.aarch64.fips140",
    module_outs = ["crypto/fips140.ko"],
)
# Distribution target: copies the :fips140 outputs into out/fips140/dist.
copy_to_dist_dir(
    name = "fips140_dist",
    data = [
        ":fips140",
    ],
    dist_dir = "out/fips140/dist",
    flat = True,
)
# allmodconfig build tests.
# These targets exist only to check that the build succeeds, so:
# - outs is kept minimal on purpose so that nothing useful is copied to DIST_DIR
# - --allow_undeclared_modules must be passed, so modules are neither declared
#   nor copied
# - there is no dist target, because the artifacts are not of interest

# tools/bazel build --allow_undeclared_modules //common:kernel_aarch64_allmodconfig
kernel_build(
    name = "kernel_aarch64_allmodconfig",
    # Hack to actually check the build: with no output files at all, Bazel
    # would conclude there is nothing to build and skip the target.
    outs = [".config"],
    build_config = "build.config.allmodconfig.aarch64",
    visibility = ["//visibility:private"],
)
# x86_64 allmodconfig build test.
# tools/bazel build --allow_undeclared_modules //common:kernel_x86_64_allmodconfig
kernel_build(
    name = "kernel_x86_64_allmodconfig",
    # Hack to actually check the build: with no output files at all, Bazel
    # would conclude there is nothing to build and skip the target.
    outs = [".config"],
    build_config = "build.config.allmodconfig.x86_64",
    visibility = ["//visibility:private"],
)
# 32-bit arm allmodconfig build test.
# tools/bazel build --allow_undeclared_modules //common:kernel_arm_allmodconfig
kernel_build(
    name = "kernel_arm_allmodconfig",
    # Hack to actually check the build: with no output files at all, Bazel
    # would conclude there is nothing to build and skip the target.
    outs = [".config"],
    build_config = "build.config.allmodconfig.arm",
    visibility = ["//visibility:private"],
)
# DDK Headers

# All headers. These are the public targets for DDK modules to use.
# `all_headers` is an alias that resolves to the aarch64 header set.
alias(
    name = "all_headers",
    actual = "all_headers_aarch64",
    visibility = ["//visibility:public"],
)
# Public aarch64 header set: the aarch64 allowlist, plus :all_headers_unsafe
# when the allow_ddk_unsafe_headers_set condition matches.
ddk_headers(
    name = "all_headers_aarch64",
    hdrs = [":all_headers_allowlist_aarch64"] + select({
        "//build/kernel/kleaf:allow_ddk_unsafe_headers_set": [":all_headers_unsafe"],
        "//conditions:default": [],
    }),
    visibility = ["//visibility:public"],
)
# Public arm header set: the arm allowlist, plus :all_headers_unsafe when the
# allow_ddk_unsafe_headers_set condition matches.
ddk_headers(
    name = "all_headers_arm",
    hdrs = [":all_headers_allowlist_arm"] + select({
        "//build/kernel/kleaf:allow_ddk_unsafe_headers_set": [":all_headers_unsafe"],
        "//conditions:default": [],
    }),
    visibility = ["//visibility:public"],
)
# Public x86_64 header set: the x86_64 allowlist, plus :all_headers_unsafe
# when the allow_ddk_unsafe_headers_set condition matches.
ddk_headers(
    name = "all_headers_x86_64",
    hdrs = [":all_headers_allowlist_x86_64"] + select({
        "//build/kernel/kleaf:allow_ddk_unsafe_headers_set": [":all_headers_unsafe"],
        "//conditions:default": [],
    }),
    visibility = ["//visibility:public"],
)
# Implementation details for DDK headers. The targets below cannot be directly
# depended on by DDK modules.

# DDK headers allowlist: the headers and include directories that are safe to
# use in DDK modules (aarch64 variant).
ddk_headers(
    name = "all_headers_allowlist_aarch64",
    hdrs = [
        ":all_headers_allowlist_aarch64_globs",
        ":all_headers_allowlist_common_globs",
    ],
    # Include directories that source files may #include headers from, i.e.
    # the `-I` options given to the C compiler.
    # These are prepended to LINUXINCLUDE.
    linux_includes = [
        "arch/arm64/include",
        "arch/arm64/include/uapi",
        "include",
        "include/uapi",
    ],
    visibility = ["//visibility:private"],
)
# DDK headers allowlist, arm variant: arm-specific globs plus the globs
# shared by all architectures.
ddk_headers(
    name = "all_headers_allowlist_arm",
    hdrs = [
        ":all_headers_allowlist_arm_globs",
        ":all_headers_allowlist_common_globs",
    ],
    # Include directories that source files may #include headers from, i.e.
    # the `-I` options given to the C compiler.
    # These are prepended to LINUXINCLUDE.
    linux_includes = [
        "arch/arm/include",
        "arch/arm/include/uapi",
        "include",
        "include/uapi",
    ],
    visibility = ["//visibility:private"],
)
# DDK headers allowlist, x86_64 variant: x86-specific globs plus the globs
# shared by all architectures.
ddk_headers(
    name = "all_headers_allowlist_x86_64",
    hdrs = [
        ":all_headers_allowlist_common_globs",
        ":all_headers_allowlist_x86_64_globs",
    ],
    # Include directories that source files may #include headers from, i.e.
    # the `-I` options given to the C compiler.
    # These are prepended to LINUXINCLUDE.
    linux_includes = [
        "arch/x86/include",
        "arch/x86/include/uapi",
        "include",
        "include/uapi",
    ],
    visibility = ["//visibility:private"],
)
# Allowlisted DDK headers that are collected with glob(), so this BUILD file
# does not have to change whenever the set of header files changes. Every
# header in these directories is safe to use.
# They are kept as separate filegroup targets so that the
# all_headers_allowlist_* targets stay friendly to batch BUILD file update
# tools such as buildozer.

# glob() for arm only
filegroup(
    name = "all_headers_allowlist_arm_globs",
    srcs = glob(["arch/arm/include/**/*.h"]),
    visibility = ["//visibility:private"],
)
# glob() for arm64 only: every .h file under arch/arm64/include.
filegroup(
    name = "all_headers_allowlist_aarch64_globs",
    srcs = glob(["arch/arm64/include/**/*.h"]),
    visibility = ["//visibility:private"],
)
# glob() for x86 only: every .h file under arch/x86/include.
filegroup(
    name = "all_headers_allowlist_x86_64_globs",
    srcs = glob(["arch/x86/include/**/*.h"]),
    visibility = ["//visibility:private"],
)
# glob() shared by all architectures: every .h file under include.
filegroup(
    name = "all_headers_allowlist_common_globs",
    srcs = glob(["include/**/*.h"]),
    visibility = ["//visibility:private"],
)
# DDK headers unsafe list: headers and include directories that may be used
# while migrating away from kernel_module targets, but that should be avoided
# in general.
# Use with caution; items may:
# - be removed without notice
# - be moved into all_headers
ddk_headers(
    name = "all_headers_unsafe",
    hdrs = [
        "drivers/devfreq/governor.h",
        "drivers/dma-buf/heaps/deferred-free-helper.h",
        "drivers/dma-buf/heaps/page_pool.h",
        "drivers/dma/dmaengine.h",
        "drivers/pci/controller/dwc/pcie-designware.h",
        "drivers/pinctrl/core.h",
        "drivers/pinctrl/samsung/pinctrl-samsung.h",
        "drivers/staging/android/debug_kinfo.h",
        "drivers/thermal/thermal_core.h",
        "drivers/thermal/thermal_netlink.h",
        "drivers/usb/core/phy.h",
        "drivers/usb/dwc3/core.h",
        "drivers/usb/dwc3/debug.h",
        "drivers/usb/dwc3/gadget.h",
        "drivers/usb/dwc3/io.h",
        "drivers/usb/dwc3/trace.h",
        "drivers/usb/gadget/configfs.h",
        "drivers/usb/gadget/function/u_serial.h",
        "drivers/usb/host/pci-quirks.h",
        "drivers/usb/host/xhci.h",
        "drivers/usb/host/xhci-ext-caps.h",
        "drivers/usb/host/xhci-mvebu.h",
        "drivers/usb/host/xhci-plat.h",
        "drivers/usb/host/xhci-rcar.h",
        "drivers/usb/typec/tcpm/tcpci.h",
    ],
    # Include directories that source files may #include headers from, i.e.
    # the `-I` options given to the C compiler.
    # Unsafe include directories are appended to ccflags-y.
    includes = [
        "drivers/devfreq",
        "drivers/dma",
        "drivers/dma-buf",
        "drivers/pci/controller/dwc",
        "drivers/pinctrl",
        "drivers/scsi/ufs",
        "drivers/thermal",
        "drivers/usb",
        "drivers/usb/gadget/function",
        "drivers/usb/typec",
    ],
    visibility = ["//visibility:private"],
)

View File

@@ -32,3 +32,21 @@ Description:
Note: If the module is built into the kernel, or if the Note: If the module is built into the kernel, or if the
CONFIG_MODULE_UNLOAD kernel configuration value is not enabled, CONFIG_MODULE_UNLOAD kernel configuration value is not enabled,
this file will not be present. this file will not be present.
What: /sys/module/MODULENAME/scmversion
Date: November 2020
KernelVersion: Android Common Kernel -- android12-5.10+
Contact: Will McVicker <willmcvicker@google.com>
Description: This read-only file will appear if modpost was supplied with an
SCM version for the module. It can be enabled with the config
MODULE_SCMVERSION. The SCM version is retrieved by
scripts/setlocalversion, which means that the presence of this
file depends on CONFIG_LOCALVERSION_AUTO=y. When read, the SCM
version that the module was compiled with is returned. The SCM
version is returned in the following format::
===
Git: g[a-f0-9]\+(-dirty)\?
Mercurial: hg[a-f0-9]\+(-dirty)\?
Subversion: svn[0-9]\+
===

View File

@@ -0,0 +1 @@
per-file sysfs-fs-f2fs=file:/fs/f2fs/OWNERS

View File

@@ -7,6 +7,7 @@ Description: UVC function directory
streaming_maxburst 0..15 (ss only) streaming_maxburst 0..15 (ss only)
streaming_maxpacket 1..1023 (fs), 1..3072 (hs/ss) streaming_maxpacket 1..1023 (fs), 1..3072 (hs/ss)
streaming_interval 1..16 streaming_interval 1..16
function_name string [32]
=================== ============================= =================== =============================
What: /config/usb-gadget/gadget/functions/uvc.name/control What: /config/usb-gadget/gadget/functions/uvc.name/control
@@ -196,7 +197,7 @@ Description: Specific MJPEG format descriptors
read-only read-only
bmaControls this format's data for bmaControls in bmaControls this format's data for bmaControls in
the streaming header the streaming header
bmInterfaceFlags specifies interlace information, bmInterlaceFlags specifies interlace information,
read-only read-only
bAspectRatioY the X dimension of the picture aspect bAspectRatioY the X dimension of the picture aspect
ratio, read-only ratio, read-only
@@ -252,7 +253,7 @@ Description: Specific uncompressed format descriptors
read-only read-only
bmaControls this format's data for bmaControls in bmaControls this format's data for bmaControls in
the streaming header the streaming header
bmInterfaceFlags specifies interlace information, bmInterlaceFlags specifies interlace information,
read-only read-only
bAspectRatioY the X dimension of the picture aspect bAspectRatioY the X dimension of the picture aspect
ratio, read-only ratio, read-only

View File

@@ -0,0 +1,19 @@
What: /sys/block/dm-<num>/bow/free
Date: January 2023
KernelVersion: 5.15
Contact: paullawrence@google.com
Description: free space
Free space on device in bytes. Only valid in state 0
Users: Android vold to determine if there is sufficient space for expected size
of checksum
What: /sys/block/dm-<num>/bow/state
Date: January 2023
KernelVersion: 5.15
Contact: paullawrence@google.com
Description: dm-bow state
Read-write string containing 0, 1 or 2
0: Trim mode
1: Checkpoint mode
2: Committed mode
See Documentation/device-mapper/dm-bow for details

View File

@@ -47,3 +47,18 @@ Description:
USB SuperSpeed protocol. From user perspective pin assignments C USB SuperSpeed protocol. From user perspective pin assignments C
and E are equal, where all channels on the connector are used and E are equal, where all channels on the connector are used
for carrying DisplayPort protocol (allowing higher resolutions). for carrying DisplayPort protocol (allowing higher resolutions).
What: /sys/bus/typec/devices/.../displayport/hpd
Date: Dec 2022
Contact: Badhri Jagan Sridharan <badhri@google.com>
Description:
VESA DisplayPort Alt Mode on USB Type-C Standard defines how
HotPlugDetect(HPD) shall be supported on the USB-C connector when
operating in DisplayPort Alt Mode. This is a read only node which
reflects the current state of HPD.
Valid values:
- 1: when HPD's logical state is high (HPD_High) as defined
by VESA DisplayPort Alt Mode on USB Type-C Standard.
- 0: when HPD's logical state is low (HPD_Low) as defined by
VESA DisplayPort Alt Mode on USB Type-C Standard.

View File

@@ -1299,6 +1299,15 @@ Description: This node is used to set or display whether UFS WriteBooster is
platform that doesn't support UFSHCD_CAP_CLK_SCALING, we can platform that doesn't support UFSHCD_CAP_CLK_SCALING, we can
disable/enable WriteBooster through this sysfs node. disable/enable WriteBooster through this sysfs node.
What: /sys/bus/platform/drivers/ufshcd/*/enable_wb_buf_flush
What: /sys/bus/platform/devices/*.ufs/enable_wb_buf_flush
Date: July 2022
Contact: Jinyoung Choi <j-young.choi@samsung.com>
Description: This entry shows the status of WriteBooster buffer flushing
and it can be used to enable or disable the flushing.
If flushing is enabled, the device executes the flush
operation when the command queue is empty.
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/hpb_version What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/hpb_version
Date: June 2021 Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com> Contact: Daejun Park <daejun7.park@samsung.com>
@@ -1394,7 +1403,7 @@ Description: This entry shows the number of reads that cannot be changed to
The file is read only. The file is read only.
What: /sys/class/scsi_device/*/device/hpb_stats/rb_noti_cnt What: /sys/class/scsi_device/*/device/hpb_stats/rcmd_noti_cnt
Date: June 2021 Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com> Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the number of response UPIUs that has Description: This entry shows the number of response UPIUs that has
@@ -1402,19 +1411,23 @@ Description: This entry shows the number of response UPIUs that has
The file is read only. The file is read only.
What: /sys/class/scsi_device/*/device/hpb_stats/rb_active_cnt What: /sys/class/scsi_device/*/device/hpb_stats/rcmd_active_cnt
Date: June 2021 Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com> Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the number of active sub-regions recommended by Description: For the HPB device control mode, this entry shows the number of
response UPIUs. active sub-regions recommended by response UPIUs. For the HPB host control
mode, this entry shows the number of active sub-regions recommended by the
HPB host control mode heuristic algorithm.
The file is read only. The file is read only.
What: /sys/class/scsi_device/*/device/hpb_stats/rb_inactive_cnt What: /sys/class/scsi_device/*/device/hpb_stats/rcmd_inactive_cnt
Date: June 2021 Date: June 2021
Contact: Daejun Park <daejun7.park@samsung.com> Contact: Daejun Park <daejun7.park@samsung.com>
Description: This entry shows the number of inactive regions recommended by Description: For the HPB device control mode, this entry shows the number of
response UPIUs. inactive regions recommended by response UPIUs. For the HPB host control
mode, this entry shows the number of inactive regions recommended by the
HPB host control mode heuristic algorithm.
The file is read only. The file is read only.
@@ -1461,6 +1474,43 @@ Description: This entry shows the status of HPB.
The file is read only. The file is read only.
Contact: Daniil Lunev <dlunev@chromium.org>
What: /sys/bus/platform/drivers/ufshcd/*/capabilities/
What: /sys/bus/platform/devices/*.ufs/capabilities/
Date: August 2022
Description: The group represents the effective capabilities of the
host-device pair. i.e. the capabilities which are enabled in the
driver for the specific host controller, supported by the host
controller and are supported and/or have compatible
configuration on the device side.
Contact: Daniil Lunev <dlunev@chromium.org>
What: /sys/bus/platform/drivers/ufshcd/*/capabilities/clock_scaling
What: /sys/bus/platform/devices/*.ufs/capabilities/clock_scaling
Date: August 2022
Contact: Daniil Lunev <dlunev@chromium.org>
Description: Indicates status of clock scaling.
== ============================
0 Clock scaling is not supported.
1 Clock scaling is supported.
== ============================
The file is read only.
What: /sys/bus/platform/drivers/ufshcd/*/capabilities/write_booster
What: /sys/bus/platform/devices/*.ufs/capabilities/write_booster
Date: August 2022
Contact: Daniil Lunev <dlunev@chromium.org>
Description: Indicates status of Write Booster.
== ============================
0 Write Booster can not be enabled.
1 Write Booster can be enabled.
== ============================
The file is read only.
What: /sys/class/scsi_device/*/device/hpb_param_sysfs/activation_thld What: /sys/class/scsi_device/*/device/hpb_param_sysfs/activation_thld
Date: February 2021 Date: February 2021
Contact: Avri Altman <avri.altman@wdc.com> Contact: Avri Altman <avri.altman@wdc.com>

View File

@@ -0,0 +1,7 @@
What: /sys/fs/erofs/features/
Date: November 2021
Contact: "Huang Jianan" <huangjianan@oppo.com>
Description: Shows all enabled kernel features.
Supported features:
zero_padding, compr_cfgs, big_pcluster, chunked_file,
device_table, compr_head2, sb_chksum.

View File

@@ -55,8 +55,9 @@ Description: Controls the in-place-update policy.
0x04 F2FS_IPU_UTIL 0x04 F2FS_IPU_UTIL
0x08 F2FS_IPU_SSR_UTIL 0x08 F2FS_IPU_SSR_UTIL
0x10 F2FS_IPU_FSYNC 0x10 F2FS_IPU_FSYNC
0x20 F2FS_IPU_ASYNC, 0x20 F2FS_IPU_ASYNC
0x40 F2FS_IPU_NOCACHE 0x40 F2FS_IPU_NOCACHE
0x80 F2FS_IPU_HONOR_OPU_WRITE
==== ================= ==== =================
Refer segment.h for details. Refer segment.h for details.
@@ -98,13 +99,47 @@ Description: Controls the issue rate of discard commands that consist of small
checkpoint is triggered, and issued during the checkpoint. checkpoint is triggered, and issued during the checkpoint.
By default, it is disabled with 0. By default, it is disabled with 0.
What: /sys/fs/f2fs/<disk>/max_ordered_discard
Date: October 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: Controls the maximum ordered discard, the unit size is one block(4KB).
Set it to 16 by default.
What: /sys/fs/f2fs/<disk>/max_discard_request
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the number of discards a thread will issue at a time.
Higher number will allow the discard thread to finish its work
faster, at the cost of higher latency for incoming I/O.
What: /sys/fs/f2fs/<disk>/min_discard_issue_time
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the interval the discard thread will wait between
issuing discard requests when there are discards to be issued and
no I/O aware interruptions occur.
What: /sys/fs/f2fs/<disk>/mid_discard_issue_time
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the interval the discard thread will wait between
issuing discard requests when there are discards to be issued and
an I/O aware interruption occurs.
What: /sys/fs/f2fs/<disk>/max_discard_issue_time
Date: December 2021
Contact: "Konstantin Vyshetsky" <vkon@google.com>
Description: Controls the interval the discard thread will wait when there are
no discard operations to be issued.
What: /sys/fs/f2fs/<disk>/discard_granularity What: /sys/fs/f2fs/<disk>/discard_granularity
Date: July 2017 Date: July 2017
Contact: "Chao Yu" <yuchao0@huawei.com> Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Controls discard granularity of inner discard thread. Inner thread Description: Controls discard granularity of inner discard thread. Inner thread
will not issue discards with size that is smaller than granularity. will not issue discards with size that is smaller than granularity.
The unit size is one block(4KB), now only support configuring The unit size is one block(4KB), now only support configuring
in range of [1, 512]. Default value is 4(=16KB). in range of [1, 512]. Default value is 16.
For small devices, default value is 1.
What: /sys/fs/f2fs/<disk>/umount_discard_timeout What: /sys/fs/f2fs/<disk>/umount_discard_timeout
Date: January 2019 Date: January 2019
@@ -112,6 +147,11 @@ Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Set timeout to issue discard commands during umount. Description: Set timeout to issue discard commands during umount.
Default: 5 secs Default: 5 secs
What: /sys/fs/f2fs/<disk>/pending_discard
Date: November 2021
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows the number of pending discard commands in the queue.
What: /sys/fs/f2fs/<disk>/max_victim_search What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014 Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com> Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@@ -202,7 +242,7 @@ Description: Shows total written kbytes issued to disk.
What: /sys/fs/f2fs/<disk>/features What: /sys/fs/f2fs/<disk>/features
Date: July 2017 Date: July 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/ Description: <deprecated: should use /sys/fs/f2fs/<disk>/feature_list/>
Shows all enabled features in current device. Shows all enabled features in current device.
Supported features: Supported features:
encryption, blkzoned, extra_attr, projquota, inode_checksum, encryption, blkzoned, extra_attr, projquota, inode_checksum,
@@ -264,11 +304,16 @@ Description: Shows current reserved blocks in system, it may be temporarily
What: /sys/fs/f2fs/<disk>/gc_urgent What: /sys/fs/f2fs/<disk>/gc_urgent
Date: August 2017 Date: August 2017
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Do background GC aggressively when set. When gc_urgent = 1, Description: Do background GC aggressively when set. Set to 0 by default.
background thread starts to do GC by given gc_urgent_sleep_time gc urgent high(1): does GC forcibly in a period of given
interval. When gc_urgent = 2, F2FS will lower the bar of gc_urgent_sleep_time and ignores I/O idling check. uses greedy
checking idle in order to process outstanding discard commands GC approach and turns SSR mode on.
and GC a little bit aggressively. It is set to 0 by default. gc urgent low(2): lowers the bar of checking I/O idling in
order to process outstanding discard commands and GC a
little bit aggressively. uses cost benefit GC approach.
gc urgent mid(3): does GC forcibly in a period of given
gc_urgent_sleep_time and executes a mid level of I/O idling check.
uses cost benefit GC approach.
What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
Date: August 2017 Date: August 2017
@@ -428,6 +473,30 @@ Description: Show status of f2fs superblock in real time.
0x4000 SBI_IS_FREEZING freefs is in process 0x4000 SBI_IS_FREEZING freefs is in process
====== ===================== ================================= ====== ===================== =================================
What: /sys/fs/f2fs/<disk>/stat/cp_status
Date: September 2022
Contact: "Chao Yu" <chao.yu@oppo.com>
Description: Show status of f2fs checkpoint in real time.
=============================== ==============================
cp flag value
CP_UMOUNT_FLAG 0x00000001
CP_ORPHAN_PRESENT_FLAG 0x00000002
CP_COMPACT_SUM_FLAG 0x00000004
CP_ERROR_FLAG 0x00000008
CP_FSCK_FLAG 0x00000010
CP_FASTBOOT_FLAG 0x00000020
CP_CRC_RECOVERY_FLAG 0x00000040
CP_NAT_BITS_FLAG 0x00000080
CP_TRIMMED_FLAG 0x00000100
CP_NOCRC_RECOVERY_FLAG 0x00000200
CP_LARGE_NAT_BITMAP_FLAG 0x00000400
CP_QUOTA_NEED_FSCK_FLAG 0x00000800
CP_DISABLED_FLAG 0x00001000
CP_DISABLED_QUICK_FLAG 0x00002000
CP_RESIZEFS_FLAG 0x00004000
=============================== ==============================
What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio
Date: January 2021 Date: January 2021
Contact: "Daeho Jeong" <daehojeong@google.com> Contact: "Daeho Jeong" <daehojeong@google.com>
@@ -499,7 +568,7 @@ Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com> Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show how many segments have been reclaimed by GC during a specific Description: Show how many segments have been reclaimed by GC during a specific
GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy, GC mode (0: GC normal, 1: GC idle CB, 2: GC idle greedy,
3: GC idle AT, 4: GC urgent high, 5: GC urgent low) 3: GC idle AT, 4: GC urgent high, 5: GC urgent low, 6: GC urgent mid)
You can re-initialize this value to "0". You can re-initialize this value to "0".
What: /sys/fs/f2fs/<disk>/gc_segment_mode What: /sys/fs/f2fs/<disk>/gc_segment_mode
@@ -513,3 +582,90 @@ Date: July 2021
Contact: "Daeho Jeong" <daehojeong@google.com> Contact: "Daeho Jeong" <daehojeong@google.com>
Description: You can control the multiplier value of bdi device readahead window size Description: You can control the multiplier value of bdi device readahead window size
between 2 (default) and 256 for POSIX_FADV_SEQUENTIAL advise option. between 2 (default) and 256 for POSIX_FADV_SEQUENTIAL advise option.
What: /sys/fs/f2fs/<disk>/max_fragment_chunk
Date: August 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: With "mode=fragment:block" mount options, we can scatter block allocation.
f2fs will allocate 1..<max_fragment_chunk> blocks in a chunk and make a hole
in the length of 1..<max_fragment_hole> by turns. This value can be set
between 1..512 and the default value is 4.
What: /sys/fs/f2fs/<disk>/max_fragment_hole
Date: August 2021
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: With "mode=fragment:block" mount options, we can scatter block allocation.
f2fs will allocate 1..<max_fragment_chunk> blocks in a chunk and make a hole
in the length of 1..<max_fragment_hole> by turns. This value can be set
between 1..512 and the default value is 4.
What: /sys/fs/f2fs/<disk>/gc_remaining_trials
Date: October 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: You can set the trial count limit for GC urgent and idle mode with this value.
If GC thread gets to the limit, the mode will turn back to GC normal mode.
By default, the value is zero, which means there is no limit like before.
What: /sys/fs/f2fs/<disk>/max_roll_forward_node_blocks
Date: January 2022
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Controls max # of node block writes to be used for roll forward
recovery. This can limit the roll forward recovery time.
What: /sys/fs/f2fs/<disk>/unusable_blocks_per_sec
Date: June 2022
Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
Description: Shows the number of unusable blocks in a section which was defined by
the zone capacity reported by underlying zoned device.
What: /sys/fs/f2fs/<disk>/current_atomic_write
Date: July 2022
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the total current atomic write block count, which is not committed yet.
This is a read-only entry.
What: /sys/fs/f2fs/<disk>/peak_atomic_write
Date: July 2022
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the peak value of total current atomic write block count after boot.
If you write "0" here, you can initialize to "0".
What: /sys/fs/f2fs/<disk>/committed_atomic_block
Date: July 2022
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the accumulated total committed atomic write block count after boot.
If you write "0" here, you can initialize to "0".
What: /sys/fs/f2fs/<disk>/revoked_atomic_block
Date: July 2022
Contact: "Daeho Jeong" <daehojeong@google.com>
Description: Show the accumulated total revoked atomic write block count after boot.
If you write "0" here, you can initialize to "0".
What: /sys/fs/f2fs/<disk>/gc_mode
Date: October 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: Show the current gc_mode as a string.
This is a read-only entry.
What: /sys/fs/f2fs/<disk>/discard_urgent_util
Date: November 2022
Contact: "Yangtao Li" <frank.li@vivo.com>
Description: When space utilization exceeds this, do background DISCARD aggressively.
Does DISCARD forcibly in a period of given min_discard_issue_time when the number
of discards is not 0 and set discard granularity to 1.
Default: 80
What: /sys/fs/f2fs/<disk>/hot_data_age_threshold
Date: November 2022
Contact: "Ping Xiong" <xiongping1@xiaomi.com>
Description: When DATA SEPARATION is on, it controls the age threshold to indicate
the data blocks as hot. By default it is initialized to 262144 blocks
(equal to 1GB).
What: /sys/fs/f2fs/<disk>/warm_data_age_threshold
Date: November 2022
Contact: "Ping Xiong" <xiongping1@xiaomi.com>
Description: When DATA SEPARATION is on, it controls the age threshold to indicate
the data blocks as warm. By default it is initialized to 2621440 blocks
(equal to 10GB).

View File

@@ -0,0 +1,19 @@
What: /sys/fs/fuse/features/fuse_bpf
Date: December 2022
Contact: Paul Lawrence <paullawrence@google.com>
Description:
Read-only file that contains the word 'supported' if fuse-bpf is
supported; the file does not exist otherwise.
What: /sys/fs/fuse/bpf_prog_type_fuse
Date: December 2022
Contact: Paul Lawrence <paullawrence@google.com>
Description:
bpf_prog_type_fuse defines the program type of bpf programs that
may be passed to fuse-bpf. For upstream bpf program types, this
is a constant defined in a contiguous array of constants.
bpf_prog_type_fuse is appended to the end of the list, so it may
change and therefore its value must be read from this file.
The contents are the ASCII decimal representation of bpf_prog_type_fuse.

View File

@@ -0,0 +1,70 @@
What: /sys/fs/incremental-fs/features/corefs
Date: 2019
Contact: Paul Lawrence <paullawrence@google.com>
Description: Reads 'supported'. Always present.
What: /sys/fs/incremental-fs/features/v2
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Reads 'supported'. Present if all v2 features of incfs are
supported.
What: /sys/fs/incremental-fs/features/zstd
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Reads 'supported'. Present if zstd compression is supported
for data blocks.
What: /sys/fs/incremental-fs/features/bugfix_throttling
Date: January 2023
Contact: Paul Lawrence <paullawrence@google.com>
Description: Reads 'supported'. Present if the throttling lock bug is fixed
https://android-review.git.corp.google.com/c/kernel/common/+/2381827
What: /sys/fs/incremental-fs/instances/[name]
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Folder created when incfs is mounted with the sysfs_name=[name]
option. If this option is used, the following values are created
in this folder.
What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_min
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Returns a count of the number of reads that were delayed as a
result of the per UID read timeouts min time setting.
What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_min_us
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Returns total delay time for all files since first mount as a
result of the per UID read timeouts min time setting.
What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_pending
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Returns a count of the number of reads that were delayed as a
result of waiting for a pending read.
What: /sys/fs/incremental-fs/instances/[name]/reads_delayed_pending_us
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Returns total delay time for all files since first mount as a
result of waiting for a pending read.
What: /sys/fs/incremental-fs/instances/[name]/reads_failed_hash_verification
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Returns number of reads that failed because of hash verification
failures.
What: /sys/fs/incremental-fs/instances/[name]/reads_failed_other
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Returns number of reads that failed for reasons other than
timing out or hash failures.
What: /sys/fs/incremental-fs/instances/[name]/reads_failed_timed_out
Date: April 2021
Contact: Paul Lawrence <paullawrence@google.com>
Description: Returns number of reads that timed out.

View File

@@ -0,0 +1,7 @@
What: /sys/kernel/dma_heap/total_pools_kb
Date: Feb 2021
KernelVersion: 5.10
Contact: Hridya Valsaraju <hridya@google.com>
Description:
The total_pools_kb file is read-only and specifies how much
memory in kB is allocated to DMA-BUF heap pools.

View File

@@ -0,0 +1,16 @@
What: /sys/kernel/wakeup_reasons/last_resume_reason
Date: February 2014
Contact: Ruchi Kandoi <kandoiruchi@google.com>
Description:
The /sys/kernel/wakeup_reasons/last_resume_reason is
used to report wakeup reasons after the system has exited suspend.
What: /sys/kernel/wakeup_reasons/last_suspend_time
Date: March 2015
Contact: jinqian <jinqian@google.com>
Description:
The /sys/kernel/wakeup_reasons/last_suspend_time is
used to report time spent in last suspend cycle. It contains
two numbers (in seconds) separated by space. First number is
the time spent in suspend and resume processes. Second number
is the time spent in sleep state.

View File

@@ -1296,6 +1296,11 @@ PAGE_SIZE multiple when read back.
pagetables pagetables
Amount of memory allocated for page tables. Amount of memory allocated for page tables.
sec_pagetables
Amount of memory allocated for secondary page tables,
this currently includes KVM mmu allocations on x86
and arm64.
percpu (npn) percpu (npn)
Amount of memory used for storing per-cpu kernel Amount of memory used for storing per-cpu kernel
data structures. data structures.

View File

@@ -961,6 +961,10 @@
can be useful when debugging issues that require an SLB can be useful when debugging issues that require an SLB
miss to occur. miss to occur.
disable_dma32= [KNL]
Dynamically disable ZONE_DMA32 on kernels compiled with
CONFIG_ZONE_DMA32=y.
stress_slb [PPC] stress_slb [PPC]
Limits the number of kernel SLB entries, and flushes Limits the number of kernel SLB entries, and flushes
them frequently to increase the rate of SLB faults them frequently to increase the rate of SLB faults
@@ -1396,6 +1400,10 @@
Format: { "fix" } Format: { "fix" }
Permit 'security.evm' to be updated regardless of Permit 'security.evm' to be updated regardless of
current integrity status. current integrity status.
export_pmu_events
[KNL,ARM64] Sets the PMU export bit (PMCR_EL0.X), which enables
the exporting of events over an IMPLEMENTATION DEFINED PMU event
export bus to another device.
failslab= failslab=
fail_usercopy= fail_usercopy=
@@ -1646,6 +1654,10 @@
If specified, z/VM IUCV HVC accepts connections If specified, z/VM IUCV HVC accepts connections
from listed z/VM user IDs only. from listed z/VM user IDs only.
hvc_dcc.enable= [ARM,ARM64] Enable DCC driver at runtime. For GKI,
disabled at runtime by default to prevent
crashes in devices which do not support DCC.
hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations
which allow the hypervisor to 'idle' the which allow the hypervisor to 'idle' the
guest on lock contention. guest on lock contention.
@@ -2056,6 +2068,14 @@
forcing Dual Address Cycle for PCI cards supporting forcing Dual Address Cycle for PCI cards supporting
greater than 32-bit addressing. greater than 32-bit addressing.
iommu.max_align_shift=
[ARM64, X86] Limit the alignment of IOVAs to a maximum
PAGE_SIZE order. Larger IOVAs will be aligned to this
specified order. The order is expressed as a power of
two multiplied by the PAGE_SIZE.
Format: { "4" | "5" | "6" | "7" | "8" | "9" }
Default: 9
iommu.strict= [ARM64, X86] Configure TLB invalidation behaviour iommu.strict= [ARM64, X86] Configure TLB invalidation behaviour
Format: { "0" | "1" } Format: { "0" | "1" }
0 - Lazy mode. 0 - Lazy mode.
@@ -2078,6 +2098,9 @@
1 - Bypass the IOMMU for DMA. 1 - Bypass the IOMMU for DMA.
unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH. unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH.
ioremap_guard [ARM64] enable the KVM MMIO guard functionality
if available.
io7= [HW] IO7 for Marvel-based Alpha systems io7= [HW] IO7 for Marvel-based Alpha systems
See comment before marvel_specify_io7 in See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c. arch/alpha/kernel/core_marvel.c.
@@ -2367,14 +2390,19 @@
kvm-arm.mode= kvm-arm.mode=
[KVM,ARM] Select one of KVM/arm64's modes of operation. [KVM,ARM] Select one of KVM/arm64's modes of operation.
none: Forcefully disable KVM.
nvhe: Standard nVHE-based mode, without support for nvhe: Standard nVHE-based mode, without support for
protected guests. protected guests.
protected: nVHE-based mode with support for guests whose protected: nVHE-based mode with support for guests whose
state is kept private from the host. state is kept private from the host. See
Not valid if the kernel is running in EL2. Documentation/virt/kvm/arm/pkvm.rst for more
information about this mode of operation.
Defaults to VHE/nVHE based on hardware support. Defaults to VHE/nVHE based on hardware support. Setting
mode to "protected" will disable kexec and hibernation
for the host.
kvm-arm.vgic_v3_group0_trap= kvm-arm.vgic_v3_group0_trap=
[KVM,ARM] Trap guest accesses to GICv3 group-0 [KVM,ARM] Trap guest accesses to GICv3 group-0

View File

@@ -32,6 +32,7 @@ the Linux memory management.
idle_page_tracking idle_page_tracking
ksm ksm
memory-hotplug memory-hotplug
multigen_lru
nommu-mmap nommu-mmap
numa_memory_policy numa_memory_policy
numaperf numaperf

View File

@@ -0,0 +1,162 @@
.. SPDX-License-Identifier: GPL-2.0
=============
Multi-Gen LRU
=============
The multi-gen LRU is an alternative LRU implementation that optimizes
page reclaim and improves performance under memory pressure. Page
reclaim decides the kernel's caching policy and ability to overcommit
memory. It directly impacts the kswapd CPU usage and RAM efficiency.
Quick start
===========
Build the kernel with the following configurations.
* ``CONFIG_LRU_GEN=y``
* ``CONFIG_LRU_GEN_ENABLED=y``
All set!
Runtime options
===============
``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
following subsections.
Kill switch
-----------
``enabled`` accepts different values to enable or disable the
following components. Its default value depends on
``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
unless some of them have unforeseen side effects. Writing to
``enabled`` has no effect when a component is not supported by the
hardware, and valid values will be accepted even when the main switch
is off.
====== ===============================================================
Values Components
====== ===============================================================
0x0001 The main switch for the multi-gen LRU.
0x0002 Clearing the accessed bit in leaf page table entries in large
batches, when MMU sets it (e.g., on x86). This behavior can
theoretically worsen lock contention (mmap_lock). If it is
disabled, the multi-gen LRU will suffer a minor performance
degradation for workloads that contiguously map hot pages,
whose accessed bits can be otherwise cleared by fewer larger
batches.
0x0004 Clearing the accessed bit in non-leaf page table entries as
well, when MMU sets it (e.g., on x86). This behavior was not
verified on x86 varieties other than Intel and AMD. If it is
disabled, the multi-gen LRU will suffer a negligible
performance degradation.
[yYnN] Apply to all the components above.
====== ===============================================================
E.g.,
::
echo y >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0007
echo 5 >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0005
Thrashing prevention
--------------------
Personal computers are more sensitive to thrashing because it can
cause janks (lags when rendering UI) and negatively impact user
experience. The multi-gen LRU offers thrashing prevention to the
majority of laptop and desktop users who do not have ``oomd``.
Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
``N`` milliseconds from getting evicted. The OOM killer is triggered
if this working set cannot be kept in memory. In other words, this
option works as an adjustable pressure relief valve, and when open, it
terminates applications that are hopefully not being used.
Based on the average human detectable lag (~100ms), ``N=1000`` usually
eliminates intolerable janks due to thrashing. Larger values like
``N=3000`` make janks less noticeable at the risk of premature OOM
kills.
The default value ``0`` means disabled.
Experimental features
=====================
``/sys/kernel/debug/lru_gen`` accepts commands described in the
following subsections. Multiple command lines are supported, as is
concatenation with delimiters ``,`` and ``;``.
``/sys/kernel/debug/lru_gen_full`` provides additional stats for
debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
evicted generations in this file.
Working set estimation
----------------------
Working set estimation measures how much memory an application needs
in a given time interval, and it is usually done with little impact on
the performance of the application. E.g., data centers want to
optimize job scheduling (bin packing) to improve memory utilizations.
When a new job comes in, the job scheduler needs to find out whether
each server it manages can allocate a certain amount of memory for
this new job before it can pick a candidate. To do so, the job
scheduler needs to estimate the working sets of the existing jobs.
When it is read, ``lru_gen`` returns a histogram of numbers of pages
accessed over different time intervals for each memcg and node.
``MAX_NR_GENS`` decides the number of bins for each histogram. The
histograms are noncumulative.
::
memcg memcg_id memcg_path
node node_id
min_gen_nr age_in_ms nr_anon_pages nr_file_pages
...
max_gen_nr age_in_ms nr_anon_pages nr_file_pages
Each bin contains an estimated number of pages that have been accessed
within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
the former is the largest and that of the latter is the smallest.
Users can write the following command to ``lru_gen`` to create a new
generation ``max_gen_nr+1``:
``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
``can_swap`` defaults to the swap setting and, if it is set to ``1``,
it forces the scan of anon pages when swap is off, and vice versa.
``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
employs heuristics to reduce the overhead, which is likely to reduce
the coverage as well.
A typical use case is that a job scheduler runs this command at a
certain time interval to create new generations, and it ranks the
servers it manages based on the sizes of their cold pages defined by
this time interval.
Proactive reclaim
-----------------
Proactive reclaim induces page reclaim when there is no memory
pressure. It usually targets cold pages only. E.g., when a new job
comes in, the job scheduler wants to proactively reclaim cold pages on
the server it selected, to improve the chance of successfully landing
this new job.
Users can write the following command to ``lru_gen`` to evict
generations less than or equal to ``min_gen_nr``.
``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
``min_gen_nr`` should be less than ``max_gen_nr-1``, since
``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
the active list) and therefore cannot be evicted. ``swappiness``
overrides the default value in ``/proc/sys/vm/swappiness``.
``nr_to_reclaim`` limits the number of pages to evict.
A typical use case is that a job scheduler runs this command before it
tries to land a new job on a server. If it fails to materialize enough
cold pages because of the overestimation, it retries on the next
server according to the ranking result obtained from the working set
estimation step. This less forceful approach limits the impacts on the
existing jobs.

View File

@@ -267,6 +267,17 @@ domain names are in general different. For a detailed discussion
see the ``hostname(1)`` man page. see the ``hostname(1)`` man page.
export_pmu_events (arm64 only)
==============================
Controls the PMU export bit (PMCR_EL0.X), which enables the exporting of
events over an IMPLEMENTATION DEFINED PMU event export bus to another device.
0: disables exporting of events (default).
1: enables exporting of events.
firmware_config firmware_config
=============== ===============
@@ -915,6 +926,17 @@ enabled, otherwise writing to this file will return ``-EBUSY``.
The default value is 8. The default value is 8.
perf_user_access (arm64 only)
=================================
Controls user space access for reading perf event counters. When set to 1,
user space can read performance monitor counter registers directly.
The default value is 0 (access disabled).
See Documentation/arm64/perf.rst for more information.
pid_max pid_max
======= =======

View File

@@ -92,7 +92,7 @@ operation if the source belongs to the supported system register space.
The infrastructure emulates only the following system register space:: The infrastructure emulates only the following system register space::
Op0=3, Op1=0, CRn=0, CRm=0,4,5,6,7 Op0=3, Op1=0, CRn=0, CRm=0,2,3,4,5,6,7
(See Table C5-6 'System instruction encodings for non-Debug System (See Table C5-6 'System instruction encodings for non-Debug System
register accesses' in ARMv8 ARM DDI 0487A.h, for the list of register accesses' in ARMv8 ARM DDI 0487A.h, for the list of
@@ -290,6 +290,44 @@ infrastructure:
+------------------------------+---------+---------+ +------------------------------+---------+---------+
| RPRES | [7-4] | y | | RPRES | [7-4] | y |
+------------------------------+---------+---------+ +------------------------------+---------+---------+
| WFXT | [3-0] | y |
+------------------------------+---------+---------+
10) MVFR0_EL1 - AArch32 Media and VFP Feature Register 0
+------------------------------+---------+---------+
| Name | bits | visible |
+------------------------------+---------+---------+
| FPDP | [11-8] | y |
+------------------------------+---------+---------+
11) MVFR1_EL1 - AArch32 Media and VFP Feature Register 1
+------------------------------+---------+---------+
| Name | bits | visible |
+------------------------------+---------+---------+
| SIMDFMAC | [31-28] | y |
+------------------------------+---------+---------+
| SIMDSP | [19-16] | y |
+------------------------------+---------+---------+
| SIMDInt | [15-12] | y |
+------------------------------+---------+---------+
| SIMDLS | [11-8] | y |
+------------------------------+---------+---------+
12) ID_ISAR5_EL1 - AArch32 Instruction Set Attribute Register 5
+------------------------------+---------+---------+
| Name | bits | visible |
+------------------------------+---------+---------+
| CRC32 | [19-16] | y |
+------------------------------+---------+---------+
| SHA2 | [15-12] | y |
+------------------------------+---------+---------+
| SHA1 | [11-8] | y |
+------------------------------+---------+---------+
| AES | [7-4] | y |
+------------------------------+---------+---------+
Appendix I: Example Appendix I: Example

View File

@@ -259,6 +259,48 @@ HWCAP2_RPRES
Functionality implied by ID_AA64ISAR2_EL1.RPRES == 0b0001. Functionality implied by ID_AA64ISAR2_EL1.RPRES == 0b0001.
HWCAP2_MTE3
Functionality implied by ID_AA64PFR1_EL1.MTE == 0b0011, as described
by Documentation/arm64/memory-tagging-extension.rst.
HWCAP2_SME
Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
by Documentation/arm64/sme.rst.
HWCAP2_SME_I16I64
Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
HWCAP2_SME_F64F64
Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
HWCAP2_SME_I8I32
Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
HWCAP2_SME_F16F32
Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
HWCAP2_SME_B16F32
Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
HWCAP2_SME_F32F32
Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
HWCAP2_SME_FA64
Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
HWCAP2_WFXT
Functionality implied by ID_AA64ISAR2_EL1.WFXT == 0b0010.
4. Unused AT_HWCAP bits 4. Unused AT_HWCAP bits
----------------------- -----------------------

View File

@@ -21,6 +21,7 @@ ARM64 Architecture
perf perf
pointer-authentication pointer-authentication
silicon-errata silicon-errata
sme
sve sve
tagged-address-abi tagged-address-abi
tagged-pointers tagged-pointers

View File

@@ -76,6 +76,9 @@ configurable behaviours:
with ``.si_code = SEGV_MTEAERR`` and ``.si_addr = 0`` (the faulting with ``.si_code = SEGV_MTEAERR`` and ``.si_addr = 0`` (the faulting
address is unknown). address is unknown).
- *Asymmetric* - Reads are handled as for synchronous mode while writes
are handled as for asynchronous mode.
The user can select the above modes, per thread, using the The user can select the above modes, per thread, using the
``prctl(PR_SET_TAGGED_ADDR_CTRL, flags, 0, 0, 0)`` system call where ``flags`` ``prctl(PR_SET_TAGGED_ADDR_CTRL, flags, 0, 0, 0)`` system call where ``flags``
contains any number of the following values in the ``PR_MTE_TCF_MASK`` contains any number of the following values in the ``PR_MTE_TCF_MASK``
@@ -139,18 +142,25 @@ tag checking mode as the CPU's preferred tag checking mode.
The preferred tag checking mode for each CPU is controlled by The preferred tag checking mode for each CPU is controlled by
``/sys/devices/system/cpu/cpu<N>/mte_tcf_preferred``, to which a ``/sys/devices/system/cpu/cpu<N>/mte_tcf_preferred``, to which a
privileged user may write the value ``async`` or ``sync``. The default privileged user may write the value ``async``, ``sync`` or ``asymm``. The
preferred mode for each CPU is ``async``. default preferred mode for each CPU is ``async``.
To allow a program to potentially run in the CPU's preferred tag To allow a program to potentially run in the CPU's preferred tag
checking mode, the user program may set multiple tag check fault mode checking mode, the user program may set multiple tag check fault mode
bits in the ``flags`` argument to the ``prctl(PR_SET_TAGGED_ADDR_CTRL, bits in the ``flags`` argument to the ``prctl(PR_SET_TAGGED_ADDR_CTRL,
flags, 0, 0, 0)`` system call. If the CPU's preferred tag checking flags, 0, 0, 0)`` system call. If both synchronous and asynchronous
mode is in the task's set of provided tag checking modes (this will modes are requested then asymmetric mode may also be selected by the
always be the case at present because the kernel only supports two kernel. If the CPU's preferred tag checking mode is in the task's set
tag checking modes, but future kernels may support more modes), that of provided tag checking modes, that mode will be selected. Otherwise,
mode will be selected. Otherwise, one of the modes in the task's mode one of the modes in the task's mode will be selected by the kernel
set will be selected in a currently unspecified manner. from the task's mode set using the preference order:
1. Asynchronous
2. Asymmetric
3. Synchronous
Note that there is no way for userspace to request multiple modes and
also disable asymmetric mode.
Initial process state Initial process state
--------------------- ---------------------

View File

@@ -102,12 +102,26 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 | | ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 |
+----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2658417 | ARM64_ERRATUM_2658417 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2224489 | ARM64_ERRATUM_2224489 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 | | ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 |
+----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1349291 | N/A | | ARM | Neoverse-N1 | #1349291 | N/A |
+----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1542419 | ARM64_ERRATUM_1542419 | | ARM | Neoverse-N1 | #1542419 | ARM64_ERRATUM_1542419 |
+----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N2 | #2139208 | ARM64_ERRATUM_2139208 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N2 | #2067961 | ARM64_ERRATUM_2067961 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N2 | #2253138 | ARM64_ERRATUM_2253138 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | MMU-500 | #841119,826419 | N/A | | ARM | MMU-500 | #841119,826419 | N/A |
+----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+

428
Documentation/arm64/sme.rst Normal file
View File

@@ -0,0 +1,428 @@
===================================================
Scalable Matrix Extension support for AArch64 Linux
===================================================
This document outlines briefly the interface provided to userspace by Linux in
order to support use of the ARM Scalable Matrix Extension (SME).
This is an outline of the most important features and issues only and not
intended to be exhaustive. It should be read in conjunction with the SVE
documentation in sve.rst which provides details on the Streaming SVE mode
included in SME.
This document does not aim to describe the SME architecture or programmer's
model. To aid understanding, a minimal description of relevant programmer's
model features for SME is included in Appendix A.
1. General
-----------
* PSTATE.SM, PSTATE.ZA, the streaming mode vector length, the ZA
register state and TPIDR2_EL0 are tracked per thread.
* The presence of SME is reported to userspace via HWCAP2_SME in the aux vector
AT_HWCAP2 entry. Presence of this flag implies the presence of the SME
instructions and registers, and the Linux-specific system interfaces
described in this document. SME is reported in /proc/cpuinfo as "sme".
* Support for the execution of SME instructions in userspace can also be
detected by reading the CPU ID register ID_AA64PFR1_EL1 using an MRS
instruction, and checking that the value of the SME field is nonzero. [3]
It does not guarantee the presence of the system interfaces described in the
following sections: software that needs to verify that those interfaces are
present must check for HWCAP2_SME instead.
* There are a number of optional SME features, the presence of which is reported
in AT_HWCAP2 through the following flags:
HWCAP2_SME_I16I64
HWCAP2_SME_F64F64
HWCAP2_SME_I8I32
HWCAP2_SME_F16F32
HWCAP2_SME_B16F32
HWCAP2_SME_F32F32
HWCAP2_SME_FA64
This list may be extended over time as the SME architecture evolves.
These extensions are also reported via the CPU ID register ID_AA64SMFR0_EL1,
which userspace can read using an MRS instruction. See elf_hwcaps.rst and
cpu-feature-registers.rst for details.
* Debuggers should restrict themselves to interacting with the target via the
NT_ARM_SVE, NT_ARM_SSVE and NT_ARM_ZA regsets. The recommended way
of detecting support for these regsets is to connect to a target process
first and then attempt a
ptrace(PTRACE_GETREGSET, pid, NT_ARM_<regset>, &iov).
* Whenever ZA register values are exchanged in memory between userspace and
the kernel, the register value is encoded in memory as a series of horizontal
vectors from 0 to VL/8-1 stored in the same endianness invariant format as is
used for SVE vectors.
* On thread creation TPIDR2_EL0 is preserved unless CLONE_SETTLS is specified,
in which case it is set to 0.
2. Vector lengths
------------------
SME defines a second vector length similar to the SVE vector length which
controls the size of the streaming mode SVE vectors and the ZA matrix array.
The ZA matrix is square with each side having as many bytes as a streaming
mode SVE vector.
3. Sharing of streaming and non-streaming mode SVE state
---------------------------------------------------------
It is implementation defined which if any parts of the SVE state are shared
between streaming and non-streaming modes. When switching between modes
via software interfaces such as ptrace if no register content is provided as
part of switching no state will be assumed to be shared and everything will
be zeroed.
4. System call behaviour
-------------------------
* On syscall PSTATE.ZA is preserved, if PSTATE.ZA==1 then the contents of the
ZA matrix are preserved.
* On syscall PSTATE.SM will be cleared and the SVE registers will be handled
as per the standard SVE ABI.
* Neither the SVE registers nor ZA are used to pass arguments to or receive
results from any syscall.
* On process creation (eg, clone()) the newly created process will have
PSTATE.SM cleared.
* All other SME state of a thread, including the currently configured vector
length, the state of the PR_SME_VL_INHERIT flag, and the deferred vector
length (if any), is preserved across all syscalls, subject to the specific
exceptions for execve() described in section 6.
5. Signal handling
-------------------
* Signal handlers are invoked with streaming mode and ZA disabled.
* A new signal frame record za_context encodes the ZA register contents on
signal delivery. [1]
* The signal frame record for ZA always contains basic metadata, in particular
the thread's vector length (in za_context.vl).
* The ZA matrix may or may not be included in the record, depending on
the value of PSTATE.ZA. The registers are present if and only if:
za_context.head.size >= ZA_SIG_CONTEXT_SIZE(sve_vq_from_vl(za_context.vl))
in which case PSTATE.ZA == 1.
* If matrix data is present, the remainder of the record has a vl-dependent
size and layout. Macros ZA_SIG_* are defined [1] to facilitate access to
them.
* The matrix is stored as a series of horizontal vectors in the same format as
is used for SVE vectors.
* If the ZA context is too big to fit in sigcontext.__reserved[], then extra
space is allocated on the stack, an extra_context record is written in
__reserved[] referencing this space. za_context is then written in the
extra space. Refer to [1] for further details about this mechanism.
5. Signal return
-----------------
When returning from a signal handler:
* If there is no za_context record in the signal frame, or if the record is
present but contains no register data as described in the previous section,
then ZA is disabled.
* If za_context is present in the signal frame and contains matrix data then
PSTATE.ZA is set to 1 and ZA is populated with the specified data.
* The vector length cannot be changed via signal return. If za_context.vl in
the signal frame does not match the current vector length, the signal return
attempt is treated as illegal, resulting in a forced SIGSEGV.
6. prctl extensions
--------------------
Some new prctl() calls are added to allow programs to manage the SME vector
length:
prctl(PR_SME_SET_VL, unsigned long arg)
Sets the vector length of the calling thread and related flags, where
arg == vl | flags. Other threads of the calling process are unaffected.
vl is the desired vector length, where sve_vl_valid(vl) must be true.
flags:
PR_SME_VL_INHERIT
Inherit the current vector length across execve(). Otherwise, the
vector length is reset to the system default at execve(). (See
Section 9.)
PR_SME_SET_VL_ONEXEC
Defer the requested vector length change until the next execve()
performed by this thread.
The effect is equivalent to implicit execution of the following
call immediately after the next execve() (if any) by the thread:
prctl(PR_SME_SET_VL, arg & ~PR_SME_SET_VL_ONEXEC)
This allows launching of a new program with a different vector
length, while avoiding runtime side effects in the caller.
Without PR_SME_SET_VL_ONEXEC, the requested change takes effect
immediately.
Return value: a nonnegative value on success, or a negative value on error:
EINVAL: SME not supported, invalid vector length requested, or
invalid flags.
On success:
* Either the calling thread's vector length or the deferred vector length
to be applied at the next execve() by the thread (dependent on whether
PR_SME_SET_VL_ONEXEC is present in arg), is set to the largest value
supported by the system that is less than or equal to vl. If vl ==
SVE_VL_MAX, the value set will be the largest value supported by the
system.
* Any previously outstanding deferred vector length change in the calling
thread is cancelled.
* The returned value describes the resulting configuration, encoded as for
PR_SME_GET_VL. The vector length reported in this value is the new
current vector length for this thread if PR_SME_SET_VL_ONEXEC was not
present in arg; otherwise, the reported vector length is the deferred
vector length that will be applied at the next execve() by the calling
thread.
* Changing the vector length causes all of ZA, P0..P15, FFR and all bits of
Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become
unspecified, including both streaming and non-streaming SVE state.
Calling PR_SME_SET_VL with vl equal to the thread's current vector
length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
does not constitute a change to the vector length for this purpose.
* Changing the vector length causes PSTATE.ZA and PSTATE.SM to be cleared.
Calling PR_SME_SET_VL with vl equal to the thread's current vector
length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
does not constitute a change to the vector length for this purpose.
prctl(PR_SME_GET_VL)
Gets the vector length of the calling thread.
The following flag may be OR-ed into the result:
PR_SME_VL_INHERIT
Vector length will be inherited across execve().
There is no way to determine whether there is an outstanding deferred
vector length change (which would only normally be the case between a
fork() or vfork() and the corresponding execve() in typical use).
To extract the vector length from the result, bitwise and it with
PR_SME_VL_LEN_MASK.
Return value: a nonnegative value on success, or a negative value on error:
EINVAL: SME not supported.
7. ptrace extensions
---------------------
* A new regset NT_ARM_SSVE is defined for access to streaming mode SVE
state via PTRACE_GETREGSET and PTRACE_SETREGSET, this is documented in
sve.rst.
* A new regset NT_ARM_ZA is defined for access to ZA state via
PTRACE_GETREGSET and PTRACE_SETREGSET.
Refer to [2] for definitions.
The regset data starts with struct user_za_header, containing:
size
Size of the complete regset, in bytes.
This depends on vl and possibly on other things in the future.
If a call to PTRACE_GETREGSET requests less data than the value of
size, the caller can allocate a larger buffer and retry in order to
read the complete regset.
max_size
Maximum size in bytes that the regset can grow to for the target
thread. The regset won't grow bigger than this even if the target
thread changes its vector length etc.
vl
Target thread's current streaming vector length, in bytes.
max_vl
Maximum possible streaming vector length for the target thread.
flags
Zero or more of the following flags, which have the same
meaning and behaviour as the corresponding PR_SET_VL_* flags:
SME_PT_VL_INHERIT
SME_PT_VL_ONEXEC (SETREGSET only).
* The effects of changing the vector length and/or flags are equivalent to
those documented for PR_SME_SET_VL.
The caller must make a further GETREGSET call if it needs to know what VL is
actually set by SETREGSET, unless it is known in advance that the requested
VL is supported.
* The size and layout of the payload depends on the header fields. The
SME_PT_ZA_*() macros are provided to facilitate access to the data.
* In either case, for SETREGSET it is permissible to omit the payload, in which
case the vector length and flags are changed and PSTATE.ZA is set to 0
(along with any consequences of those changes). If a payload is provided
then PSTATE.ZA will be set to 1.
* For SETREGSET, if the requested VL is not supported, the effect will be the
same as if the payload were omitted, except that an EIO error is reported.
No attempt is made to translate the payload data to the correct layout
for the vector length actually set. It is up to the caller to translate the
payload layout for the actual VL and retry.
* The effect of writing a partial, incomplete payload is unspecified.
8. ELF coredump extensions
---------------------------
* NT_ARM_SSVE notes will be added to each coredump for
each thread of the dumped process. The contents will be equivalent to the
data that would have been read if a PTRACE_GETREGSET of the corresponding
type were executed for each thread when the coredump was generated.
* A NT_ARM_ZA note will be added to each coredump for each thread of the
dumped process. The contents will be equivalent to the data that would have
been read if a PTRACE_GETREGSET of NT_ARM_ZA were executed for each thread
when the coredump was generated.
9. System runtime configuration
--------------------------------
* To mitigate the ABI impact of expansion of the signal frame, a policy
mechanism is provided for administrators, distro maintainers and developers
to set the default vector length for userspace processes:
/proc/sys/abi/sme_default_vector_length
Writing the text representation of an integer to this file sets the system
default vector length to the specified value, unless the value is greater
than the maximum vector length supported by the system in which case the
default vector length is set to that maximum.
The result can be determined by reopening the file and reading its
contents.
At boot, the default vector length is initially set to 32 or the maximum
supported vector length, whichever is smaller and supported. This
determines the initial vector length of the init process (PID 1).
Reading this file returns the current system default vector length.
* At every execve() call, the new vector length of the new process is set to
the system default vector length, unless
* PR_SME_VL_INHERIT (or equivalently SME_PT_VL_INHERIT) is set for the
calling thread, or
* a deferred vector length change is pending, established via the
PR_SME_SET_VL_ONEXEC flag (or SME_PT_VL_ONEXEC).
* Modifying the system default vector length does not affect the vector length
of any existing process or thread that does not make an execve() call.
Appendix A. SME programmer's model (informative)
=================================================
This section provides a minimal description of the additions made by SME to the
ARMv8-A programmer's model that are relevant to this document.
Note: This section is for information only and not intended to be complete or
to replace any architectural specification.
A.1. Registers
---------------
In A64 state, SME adds the following:
* A new mode, streaming mode, in which a subset of the normal FPSIMD and SVE
features are available. When supported EL0 software may enter and leave
streaming mode at any time.
For best system performance it is strongly encouraged for software to enable
streaming mode only when it is actively being used.
* A new vector length controlling the size of ZA and the Z registers when in
streaming mode, separately to the vector length used for SVE when not in
streaming mode. There is no requirement that either the currently selected
vector length or the set of vector lengths supported for the two modes in
a given system have any relationship. The streaming mode vector length
is referred to as SVL.
* A new ZA matrix register. This is a square matrix of SVLxSVL bits. Most
operations on ZA require that streaming mode be enabled but ZA can be
enabled without streaming mode in order to load, save and retain data.
For best system performance it is strongly encouraged for software to enable
ZA only when it is actively being used.
* Two new 1 bit fields in PSTATE which may be controlled via the SMSTART and
SMSTOP instructions or by access to the SVCR system register:
* PSTATE.ZA, if this is 1 then the ZA matrix is accessible and has valid
data while if it is 0 then ZA can not be accessed. When PSTATE.ZA is
changed from 0 to 1 all bits in ZA are cleared.
* PSTATE.SM, if this is 1 then the PE is in streaming mode. When the value
of PSTATE.SM is changed then it is implementation defined if the subset
of the floating point register bits valid in both modes may be retained.
Any other bits will be cleared.
References
==========
[1] arch/arm64/include/uapi/asm/sigcontext.h
AArch64 Linux signal ABI definitions
[2] arch/arm64/include/uapi/asm/ptrace.h
AArch64 Linux ptrace ABI definitions
[3] Documentation/arm64/cpu-feature-registers.rst

View File

@@ -7,7 +7,9 @@ Author: Dave Martin <Dave.Martin@arm.com>
Date: 4 August 2017 Date: 4 August 2017
This document outlines briefly the interface provided to userspace by Linux in This document outlines briefly the interface provided to userspace by Linux in
order to support use of the ARM Scalable Vector Extension (SVE). order to support use of the ARM Scalable Vector Extension (SVE), including
interactions with Streaming SVE mode added by the Scalable Matrix Extension
(SME).
This is an outline of the most important features and issues only and not This is an outline of the most important features and issues only and not
intended to be exhaustive. intended to be exhaustive.
@@ -23,6 +25,10 @@ model features for SVE is included in Appendix A.
* SVE registers Z0..Z31, P0..P15 and FFR and the current vector length VL, are * SVE registers Z0..Z31, P0..P15 and FFR and the current vector length VL, are
tracked per-thread. tracked per-thread.
* In streaming mode FFR is not accessible unless HWCAP2_SME_FA64 is present
in the system; when it is not supported and these interfaces are used to
access streaming mode, FFR is read and written as zero.
* The presence of SVE is reported to userspace via HWCAP_SVE in the aux vector * The presence of SVE is reported to userspace via HWCAP_SVE in the aux vector
AT_HWCAP entry. Presence of this flag implies the presence of the SVE AT_HWCAP entry. Presence of this flag implies the presence of the SVE
instructions and registers, and the Linux-specific system interfaces instructions and registers, and the Linux-specific system interfaces
@@ -53,10 +59,19 @@ model features for SVE is included in Appendix A.
which userspace can read using an MRS instruction. See elf_hwcaps.txt and which userspace can read using an MRS instruction. See elf_hwcaps.txt and
cpu-feature-registers.txt for details. cpu-feature-registers.txt for details.
* On hardware that supports the SME extensions, HWCAP2_SME will also be
reported in the AT_HWCAP2 aux vector entry. Among other things SME adds
streaming mode which provides a subset of the SVE feature set using a
separate SME vector length and the same Z/V registers. See sme.rst
for more details.
* Debuggers should restrict themselves to interacting with the target via the * Debuggers should restrict themselves to interacting with the target via the
NT_ARM_SVE regset. The recommended way of detecting support for this regset NT_ARM_SVE regset. The recommended way of detecting support for this regset
is to connect to a target process first and then attempt a is to connect to a target process first and then attempt a
ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov). ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov). Note that when SME is
present and streaming SVE mode is in use the FPSIMD subset of registers
will be read via NT_ARM_SVE and NT_ARM_SVE writes will exit streaming mode
in the target.
* Whenever SVE scalable register values (Zn, Pn, FFR) are exchanged in memory * Whenever SVE scalable register values (Zn, Pn, FFR) are exchanged in memory
between userspace and the kernel, the register value is encoded in memory in between userspace and the kernel, the register value is encoded in memory in
@@ -126,6 +141,11 @@ the SVE instruction set architecture.
are only present in fpsimd_context. For convenience, the content of V0..V31 are only present in fpsimd_context. For convenience, the content of V0..V31
is duplicated between sve_context and fpsimd_context. is duplicated between sve_context and fpsimd_context.
* The record contains a flag field which includes a flag SVE_SIG_FLAG_SM which
if set indicates that the thread is in streaming mode and the vector length
and register data (if present) describe the streaming SVE data and vector
length.
* The signal frame record for SVE always contains basic metadata, in particular * The signal frame record for SVE always contains basic metadata, in particular
the thread's vector length (in sve_context.vl). the thread's vector length (in sve_context.vl).
@@ -170,6 +190,11 @@ When returning from a signal handler:
the signal frame does not match the current vector length, the signal return the signal frame does not match the current vector length, the signal return
attempt is treated as illegal, resulting in a forced SIGSEGV. attempt is treated as illegal, resulting in a forced SIGSEGV.
* It is permitted to enter or leave streaming mode by setting or clearing
the SVE_SIG_FLAG_SM flag but applications should take care to ensure that
when doing so sve_context.vl and any register data are appropriate for the
vector length in the new mode.
6. prctl extensions 6. prctl extensions
-------------------- --------------------
@@ -255,7 +280,7 @@ prctl(PR_SVE_GET_VL)
vector length change (which would only normally be the case between a vector length change (which would only normally be the case between a
fork() or vfork() and the corresponding execve() in typical use). fork() or vfork() and the corresponding execve() in typical use).
To extract the vector length from the result, and it with To extract the vector length from the result, bitwise and it with
PR_SVE_VL_LEN_MASK. PR_SVE_VL_LEN_MASK.
Return value: a nonnegative value on success, or a negative value on error: Return value: a nonnegative value on success, or a negative value on error:
@@ -265,8 +290,14 @@ prctl(PR_SVE_GET_VL)
7. ptrace extensions 7. ptrace extensions
--------------------- ---------------------
* A new regset NT_ARM_SVE is defined for use with PTRACE_GETREGSET and * New regsets NT_ARM_SVE and NT_ARM_SSVE are defined for use with
PTRACE_SETREGSET. PTRACE_GETREGSET and PTRACE_SETREGSET. NT_ARM_SSVE describes the
streaming mode SVE registers and NT_ARM_SVE describes the
non-streaming mode SVE registers.
In this description a register set is referred to as being "live" when
the target is in the appropriate streaming or non-streaming mode and is
using data beyond the subset shared with the FPSIMD Vn registers.
Refer to [2] for definitions. Refer to [2] for definitions.
@@ -297,7 +328,7 @@ The regset data starts with struct user_sve_header, containing:
flags flags
either at most one of
SVE_PT_REGS_FPSIMD SVE_PT_REGS_FPSIMD
@@ -331,6 +362,10 @@ The regset data starts with struct user_sve_header, containing:
SVE_PT_VL_ONEXEC (SETREGSET only). SVE_PT_VL_ONEXEC (SETREGSET only).
If neither FPSIMD nor SVE flags are provided then no register
payload is available, this is only possible when SME is implemented.
* The effects of changing the vector length and/or flags are equivalent to * The effects of changing the vector length and/or flags are equivalent to
those documented for PR_SVE_SET_VL. those documented for PR_SVE_SET_VL.
@@ -346,6 +381,13 @@ The regset data starts with struct user_sve_header, containing:
case only the vector length and flags are changed (along with any case only the vector length and flags are changed (along with any
consequences of those changes). consequences of those changes).
* In systems supporting SME when in streaming mode a GETREGSET for
NT_ARM_SVE will return only the user_sve_header with no register data,
similarly a GETREGSET for NT_ARM_SSVE will not return any register data
when not in streaming mode.
* A GETREGSET for NT_ARM_SSVE will never return SVE_PT_REGS_FPSIMD.
* For SETREGSET, if an SVE_PT_REGS_SVE payload is present and the * For SETREGSET, if an SVE_PT_REGS_SVE payload is present and the
requested VL is not supported, the effect will be the same as if the requested VL is not supported, the effect will be the same as if the
payload were omitted, except that an EIO error is reported. No payload were omitted, except that an EIO error is reported. No
@@ -355,17 +397,25 @@ The regset data starts with struct user_sve_header, containing:
unspecified. It is up to the caller to translate the payload layout unspecified. It is up to the caller to translate the payload layout
for the actual VL and retry. for the actual VL and retry.
* Where SME is implemented it is not possible to GETREGSET the register
state for normal SVE when in streaming mode, nor the streaming mode
register state when in normal mode, regardless of the implementation defined
behaviour of the hardware for sharing data between the two modes.
* Any SETREGSET of NT_ARM_SVE will exit streaming mode if the target was in
streaming mode and any SETREGSET of NT_ARM_SSVE will enter streaming mode
if the target was not in streaming mode.
* The effect of writing a partial, incomplete payload is unspecified. * The effect of writing a partial, incomplete payload is unspecified.
8. ELF coredump extensions 8. ELF coredump extensions
--------------------------- ---------------------------
* A NT_ARM_SVE note will be added to each coredump for each thread of the * NT_ARM_SVE and NT_ARM_SSVE notes will be added to each coredump for
dumped process. The contents will be equivalent to the data that would have each thread of the dumped process. The contents will be equivalent to the
been read if a PTRACE_GETREGSET of NT_ARM_SVE were executed for each thread data that would have been read if a PTRACE_GETREGSET of the corresponding
when the coredump was generated. type were executed for each thread when the coredump was generated.
9. System runtime configuration 9. System runtime configuration
-------------------------------- --------------------------------

View File

@@ -1,5 +1,7 @@
.. SPDX-License-Identifier: GPL-2.0 .. SPDX-License-Identifier: GPL-2.0
.. _inline_encryption:
================= =================
Inline Encryption Inline Encryption
================= =================
@@ -7,230 +9,269 @@ Inline Encryption
Background Background
========== ==========
Inline encryption hardware sits logically between memory and the disk, and can Inline encryption hardware sits logically between memory and disk, and can
en/decrypt data as it goes in/out of the disk. Inline encryption hardware has a en/decrypt data as it goes in/out of the disk. For each I/O request, software
fixed number of "keyslots" - slots into which encryption contexts (i.e. the can control exactly how the inline encryption hardware will en/decrypt the data
encryption key, encryption algorithm, data unit size) can be programmed by the in terms of key, algorithm, data unit size (the granularity of en/decryption),
kernel at any time. Each request sent to the disk can be tagged with the index and data unit number (a value that determines the initialization vector(s)).
of a keyslot (and also a data unit number to act as an encryption tweak), and
the inline encryption hardware will en/decrypt the data in the request with the
encryption context programmed into that keyslot. This is very different from
full disk encryption solutions like self encrypting drives/TCG OPAL/ATA
Security standards, since with inline encryption, any block on disk could be
encrypted with any encryption context the kernel chooses.
Some inline encryption hardware accepts all encryption parameters including raw
keys directly in low-level I/O requests. However, most inline encryption
hardware instead has a fixed number of "keyslots" and requires that the key,
algorithm, and data unit size first be programmed into a keyslot. Each
low-level I/O request then just contains a keyslot index and data unit number.
Note that inline encryption hardware is very different from traditional crypto
accelerators, which are supported through the kernel crypto API. Traditional
crypto accelerators operate on memory regions, whereas inline encryption
hardware operates on I/O requests. Thus, inline encryption hardware needs to be
managed by the block layer, not the kernel crypto API.
Inline encryption hardware is also very different from "self-encrypting drives",
such as those based on the TCG Opal or ATA Security standards. Self-encrypting
drives don't provide fine-grained control of encryption and provide no way to
verify the correctness of the resulting ciphertext. Inline encryption hardware
provides fine-grained control of encryption, including the choice of key and
initialization vector for each sector, and can be tested for correctness.
Objective Objective
========= =========
We want to support inline encryption (IE) in the kernel. We want to support inline encryption in the kernel. To make testing easier, we
To allow for testing, we also want a crypto API fallback when actual also want support for falling back to the kernel crypto API when actual inline
IE hardware is absent. We also want IE to work with layered devices encryption hardware is absent. We also want inline encryption to work with
like dm and loopback (i.e. we want to be able to use the IE hardware layered devices like device-mapper and loopback (i.e. we want to be able to use
of the underlying devices if present, or else fall back to crypto API the inline encryption hardware of the underlying devices if present, or else
en/decryption). fall back to crypto API en/decryption).
Constraints and notes Constraints and notes
===================== =====================
- IE hardware has a limited number of "keyslots" that can be programmed - We need a way for upper layers (e.g. filesystems) to specify an encryption
with an encryption context (key, algorithm, data unit size, etc.) at any time. context to use for en/decrypting a bio, and device drivers (e.g. UFSHCD) need
One can specify a keyslot in a data request made to the device, and the to be able to use that encryption context when they process the request.
device will en/decrypt the data using the encryption context programmed into Encryption contexts also introduce constraints on bio merging; the block layer
that specified keyslot. When possible, we want to make multiple requests with needs to be aware of these constraints.
the same encryption context share the same keyslot.
- We need a way for upper layers like filesystems to specify an encryption - Different inline encryption hardware has different supported algorithms,
context to use for en/decrypting a struct bio, and a device driver (like UFS) supported data unit sizes, maximum data unit numbers, etc. We call these
needs to be able to use that encryption context when it processes the bio. properties the "crypto capabilities". We need a way for device drivers to
advertise crypto capabilities to upper layers in a generic way.
- We need a way for device drivers to expose their inline encryption - Inline encryption hardware usually (but not always) requires that keys be
capabilities in a unified way to the upper layers. programmed into keyslots before being used. Since programming keyslots may be
slow and there may not be very many keyslots, we shouldn't just program the
key for every I/O request, but rather keep track of which keys are in the
keyslots and reuse an already-programmed keyslot when possible.
- Upper layers typically define a specific end-of-life for crypto keys, e.g.
when an encrypted directory is locked or when a crypto mapping is torn down.
At these times, keys are wiped from memory. We must provide a way for upper
layers to also evict keys from any keyslots they are present in.
Design - When possible, device-mapper devices must be able to pass through the inline
====== encryption support of their underlying devices. However, it doesn't make
sense for device-mapper devices to have keyslots themselves.
We add a struct bio_crypt_ctx to struct bio that can Basic design
represent an encryption context, because we need to be able to pass this ============
encryption context from the upper layers (like the fs layer) to the
device driver to act upon.
While IE hardware works on the notion of keyslots, the FS layer has no We introduce ``struct blk_crypto_key`` to represent an inline encryption key and
knowledge of keyslots - it simply wants to specify an encryption context to how it will be used. This includes the type of the key (standard or
use while en/decrypting a bio. hardware-wrapped); the actual bytes of the key; the size of the key; the
algorithm and data unit size the key will be used with; and the number of bytes
needed to represent the maximum data unit number the key will be used with.
We introduce a keyslot manager (KSM) that handles the translation from We introduce ``struct bio_crypt_ctx`` to represent an encryption context. It
encryption contexts specified by the FS to keyslots on the IE hardware. contains a data unit number and a pointer to a blk_crypto_key. We add pointers
This KSM also serves as the way IE hardware can expose its capabilities to to a bio_crypt_ctx to ``struct bio`` and ``struct request``; this allows users
upper layers. The generic mode of operation is: each device driver that wants of the block layer (e.g. filesystems) to provide an encryption context when
to support IE will construct a KSM and set it up in its struct request_queue. creating a bio and have it be passed down the stack for processing by the block
Upper layers that want to use IE on this device can then use this KSM in layer and device drivers. Note that the encryption context doesn't explicitly
the device's struct request_queue to translate an encryption context into say whether to encrypt or decrypt, as that is implicit from the direction of the
a keyslot. The presence of the KSM in the request queue shall be used to mean bio; WRITE means encrypt, and READ means decrypt.
that the device supports IE.
The KSM uses refcounts to track which keyslots are idle (either they have no We also introduce ``struct blk_crypto_profile`` to contain all generic inline
encryption context programmed, or there are no in-flight struct bios encryption-related state for a particular inline encryption device. The
referencing that keyslot). When a new encryption context needs a keyslot, it blk_crypto_profile serves as the way that drivers for inline encryption hardware
tries to find a keyslot that has already been programmed with the same advertise their crypto capabilities and provide certain functions (e.g.,
encryption context, and if there is no such keyslot, it evicts the least functions to program and evict keys) to upper layers. Each device driver that
recently used idle keyslot and programs the new encryption context into that wants to support inline encryption will construct a blk_crypto_profile, then
one. If no idle keyslots are available, then the caller will sleep until there associate it with the disk's request_queue.
is at least one.
The blk_crypto_profile also manages the hardware's keyslots, when applicable.
This happens in the block layer, so that users of the block layer can just
specify encryption contexts and don't need to know about keyslots at all, nor do
device drivers need to care about most details of keyslot management.
blk-mq changes, other block layer changes and blk-crypto-fallback Specifically, for each keyslot, the block layer (via the blk_crypto_profile)
================================================================= keeps track of which blk_crypto_key that keyslot contains (if any), and how many
in-flight I/O requests are using it. When the block layer creates a
``struct request`` for a bio that has an encryption context, it grabs a keyslot
that already contains the key if possible. Otherwise it waits for an idle
keyslot (a keyslot that isn't in-use by any I/O), then programs the key into the
least-recently-used idle keyslot using the function the device driver provided.
In both cases, the resulting keyslot is stored in the ``crypt_keyslot`` field of
the request, where it is then accessible to device drivers and is released after
the request completes.
We add a pointer to a ``bi_crypt_context`` and ``keyslot`` to ``struct request`` also contains a pointer to the original bio_crypt_ctx.
struct request. These will be referred to as the ``crypto fields`` Requests can be built from multiple bios, and the block layer must take the
for the request. This ``keyslot`` is the keyslot into which the encryption context into account when trying to merge bios and requests. For two
``bi_crypt_context`` has been programmed in the KSM of the ``request_queue`` bios/requests to be merged, they must have compatible encryption contexts: both
that this request is being sent to. unencrypted, or both encrypted with the same key and contiguous data unit
numbers. Only the encryption context for the first bio in a request is
retained, since the remaining bios have been verified to be merge-compatible
with the first bio.
We introduce ``block/blk-crypto-fallback.c``, which allows upper layers to remain To make it possible for inline encryption to work with request_queue based
blissfully unaware of whether or not real inline encryption hardware is present layered devices, when a request is cloned, its encryption context is cloned as
underneath. When a bio is submitted with a target ``request_queue`` that doesn't well. When the cloned request is submitted, it is then processed as usual; this
support the encryption context specified with the bio, the block layer will includes getting a keyslot from the clone's target device if needed.
en/decrypt the bio with the blk-crypto-fallback.
If the bio is a ``WRITE`` bio, a bounce bio is allocated, and the data in the bio blk-crypto-fallback
is encrypted and stored in the bounce bio - blk-mq will then proceed to process the ===================
bounce bio as if it were not encrypted at all (except when blk-integrity is
concerned). ``blk-crypto-fallback`` sets the bounce bio's ``bi_end_io`` to an
internal function that cleans up the bounce bio and ends the original bio.
If the bio is a ``READ`` bio, the bio's ``bi_end_io`` (and also ``bi_private``) It is desirable for the inline encryption support of upper layers (e.g.
is saved and overwritten by ``blk-crypto-fallback`` to filesystems) to be testable without real inline encryption hardware, and
``bio_crypto_fallback_decrypt_bio``. The bio's ``bi_crypt_context`` is also likewise for the block layer's keyslot management logic. It is also desirable
overwritten with ``NULL``, so that to the rest of the stack, the bio looks to allow upper layers to just always use inline encryption rather than have to
as if it was a regular bio that never had an encryption context specified. implement encryption in multiple ways.
``bio_crypto_fallback_decrypt_bio`` will decrypt the bio, restore the original
``bi_end_io`` (and also ``bi_private``) and end the bio again.
Regardless of whether real inline encryption hardware is used or the Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
of inline encryption using the kernel crypto API. blk-crypto-fallback is built
into the block layer, so it works on any block device without any special setup.
Essentially, when a bio with an encryption context is submitted to a
block_device that doesn't support that encryption context, the block layer will
handle en/decryption of the bio using blk-crypto-fallback.
For encryption, the data cannot be encrypted in-place, as callers usually rely
on it being unmodified. Instead, blk-crypto-fallback allocates bounce pages,
fills a new bio with those bounce pages, encrypts the data into those bounce
pages, and submits that "bounce" bio. When the bounce bio completes,
blk-crypto-fallback completes the original bio. If the original bio is too
large, multiple bounce bios may be required; see the code for details.
For decryption, blk-crypto-fallback "wraps" the bio's completion callback
(``bi_end_io``) and private data (``bi_private``) with its own, unsets the
bio's encryption context, then submits the bio. If the read completes
successfully, blk-crypto-fallback restores the bio's original completion
callback and private data, then decrypts the bio's data in-place using the
kernel crypto API. Decryption happens from a workqueue, as it may sleep.
Afterwards, blk-crypto-fallback completes the bio.
In both cases, the bios that blk-crypto-fallback submits no longer have an
encryption context. Therefore, lower layers only see standard unencrypted I/O.
blk-crypto-fallback also defines its own blk_crypto_profile and has its own
"keyslots"; its keyslots contain ``struct crypto_skcipher`` objects. The reason
for this is twofold. First, it allows the keyslot management logic to be tested
without actual inline encryption hardware. Second, similar to actual inline
encryption hardware, the crypto API doesn't accept keys directly in requests but
rather requires that keys be set ahead of time, and setting keys can be
expensive; moreover, allocating a crypto_skcipher can't happen on the I/O path
at all due to the locks it takes. Therefore, the concept of keyslots still
makes sense for blk-crypto-fallback.
Note that regardless of whether real inline encryption hardware or
blk-crypto-fallback is used, the ciphertext written to disk (and hence the blk-crypto-fallback is used, the ciphertext written to disk (and hence the
on-disk format of data) will be the same (assuming the hardware's implementation on-disk format of data) will be the same (assuming that both the inline
of the algorithm being used adheres to spec and functions correctly). encryption hardware's implementation and the kernel crypto API's implementation
of the algorithm being used adhere to spec and function correctly).
If a ``request queue``'s inline encryption hardware claimed to support the
encryption context specified with a bio, then it will not be handled by the
``blk-crypto-fallback``. We will eventually reach a point in blk-mq when a
struct request needs to be allocated for that bio. At that point,
blk-mq tries to program the encryption context into the ``request_queue``'s
keyslot_manager, and obtain a keyslot, which it stores in its newly added
``keyslot`` field. This keyslot is released when the request is completed.
When the first bio is added to a request, ``blk_crypto_rq_bio_prep`` is called,
which sets the request's ``crypt_ctx`` to a copy of the bio's
``bi_crypt_context``. bio_crypt_do_front_merge is called whenever a subsequent
bio is merged to the front of the request, which updates the ``crypt_ctx`` of
the request so that it matches the newly merged bio's ``bi_crypt_context``. In particular, the request keeps a copy of the ``bi_crypt_context`` of the first
bio in its bio-list (blk-mq needs to be careful to maintain this invariant
during bio and request merges).
To make it possible for inline encryption to work with request queue based
layered devices, when a request is cloned, its ``crypto fields`` are cloned as
well. When the cloned request is submitted, blk-mq programs the
``bi_crypt_context`` of the request into the clone's request_queue's keyslot
manager, and stores the returned keyslot in the clone's ``keyslot``.
blk-crypto-fallback is optional and is controlled by the
``CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK`` kernel configuration option.
API presented to users of the block layer API presented to users of the block layer
========================================= =========================================
``struct blk_crypto_key`` represents a crypto key (the raw key, size of the ``blk_crypto_config_supported()`` allows users to check ahead of time whether
key, the crypto algorithm to use, the data unit size to use, and the number of inline encryption with particular crypto settings will work on a particular
bytes required to represent data unit numbers that will be specified with the block_device -- either via hardware or via blk-crypto-fallback. This function
``bi_crypt_context``). takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
the actual bytes of the key and instead just contains the algorithm, data unit
size, etc. This function can be useful if blk-crypto-fallback is disabled.
``blk_crypto_init_key`` allows upper layers to initialize such a ``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
``blk_crypto_key``.
``bio_crypt_set_ctx`` should be called on any bio that a user of Users must call ``blk_crypto_start_using_key()`` before actually starting to use
the block layer wants en/decrypted via inline encryption (or the a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()``
blk-crypto-fallback, if hardware support isn't available for the desired was called earlier). This is needed to initialize blk-crypto-fallback if it
crypto configuration). This function takes the ``blk_crypto_key`` and the will be needed. This must not be called from the data path, as this may have to
data unit number (DUN) to use when en/decrypting the bio. allocate resources, which may deadlock in that case.
``blk_crypto_config_supported`` allows upper layers to query whether or not Next, to attach an encryption context to a bio, users should call
an encryption context passed to a request queue can be handled by blk-crypto ``bio_crypt_set_ctx()``. This function allocates a bio_crypt_ctx and attaches
(either by real inline encryption hardware, or by the blk-crypto-fallback). it to a bio, given the blk_crypto_key and the data unit number that will be used
This is useful e.g. when blk-crypto-fallback is disabled, and the upper layer for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx
wants to use an algorithm that may not be supported by hardware - this function later, as that happens automatically when the bio is freed or reset.
lets the upper layer know ahead of time that the algorithm isn't supported,
and the upper layer can fallback to something else if appropriate.
``blk_crypto_start_using_key`` - Upper layers must call this function on Finally, when done using inline encryption with a blk_crypto_key on a
``blk_crypto_key`` and a ``request_queue`` before using the key with any bio block_device, users must call ``blk_crypto_evict_key()``. This ensures that
headed for that ``request_queue``. This function ensures that either the the key is evicted from all keyslots it may be programmed into and unlinked from
hardware supports the key's crypto settings, or the crypto API fallback has any kernel data structures it may be linked into.
transforms for the needed mode allocated and ready to go. Note that this
function may allocate an ``skcipher``, and must not be called from the data
path, since allocating ``skciphers`` from the data path can deadlock.
``blk_crypto_evict_key`` *must* be called by upper layers before a In summary, for users of the block layer, the lifecycle of a blk_crypto_key is
``blk_crypto_key`` is freed. Further, it *must* be called only once as follows:
there are no more in-flight requests that use that ``blk_crypto_key``.
``blk_crypto_evict_key`` will ensure that a key is removed from any keyslots in 1. ``blk_crypto_config_supported()`` (optional)
inline encryption hardware that the key might have been programmed into (or the blk-crypto-fallback). 2. ``blk_crypto_init_key()``
3. ``blk_crypto_start_using_key()``
4. ``bio_crypt_set_ctx()`` (potentially many times)
5. ``blk_crypto_evict_key()`` (after all I/O has completed)
6. Zeroize the blk_crypto_key (this has no dedicated function)
If a blk_crypto_key is being used on multiple block_devices, then
``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
and ``blk_crypto_evict_key()`` must be called on each block_device.
API presented to device drivers API presented to device drivers
=============================== ===============================
A :c:type:``struct blk_keyslot_manager`` should be set up by device drivers in A device driver that wants to support inline encryption must set up a
the ``request_queue`` of the device. The device driver needs to call blk_crypto_profile in the request_queue of its device. To do this, it first
``blk_ksm_init`` (or its resource-managed variant ``devm_blk_ksm_init``) on the must call ``blk_crypto_profile_init()`` (or its resource-managed variant
``blk_keyslot_manager``, while specifying the number of keyslots supported by ``devm_blk_crypto_profile_init()``), providing the number of keyslots.
the hardware.
The device driver also needs to tell the KSM how to actually manipulate the Next, it must advertise its crypto capabilities by setting fields in the
IE hardware in the device to do things like programming the crypto key into blk_crypto_profile, e.g. ``modes_supported`` and ``max_dun_bytes_supported``.
the IE hardware into a particular keyslot. All this is achieved through the
struct blk_ksm_ll_ops field in the KSM that the device driver
must fill up after initing the ``blk_keyslot_manager``.
The KSM also handles runtime power management for the device when applicable It then must set function pointers in the ``ll_ops`` field of the
(e.g. when it wants to program a crypto key into the IE hardware, the device blk_crypto_profile to tell upper layers how to control the inline encryption
must be runtime powered on) - so the device driver must also set the ``dev`` hardware, e.g. how to program and evict keyslots. Most drivers will need to
field in the ksm to point to the `struct device` for the KSM to use for runtime implement ``keyslot_program`` and ``keyslot_evict``. For details, see the
power management. comments for ``struct blk_crypto_ll_ops``.
``blk_ksm_reprogram_all_keys`` can be called by device drivers if the device Once the driver registers a blk_crypto_profile with a request_queue, I/O
needs each and every one of its keyslots to be reprogrammed with the key it requests the driver receives via that queue may have an encryption context. All
"should have" at the point in time when the function is called. This is useful encryption contexts will be compatible with the crypto capabilities declared in
e.g. if a device loses all its keys on runtime power down/up. the blk_crypto_profile, so drivers don't need to worry about handling
unsupported requests. Also, if a nonzero number of keyslots was declared in the
blk_crypto_profile, then all I/O requests that have an encryption context will
also have a keyslot which was already programmed with the appropriate key.
If the driver used ``blk_ksm_init`` instead of ``devm_blk_ksm_init``, then If the driver implements runtime suspend and its blk_crypto_ll_ops don't work
``blk_ksm_destroy`` should be called to free up all resources used by a while the device is runtime-suspended, then the driver must also set the ``dev``
``blk_keyslot_manager`` once it is no longer needed. field of the blk_crypto_profile to point to the ``struct device`` that will be
resumed before any of the low-level operations are called.
If there are situations where the inline encryption hardware loses the contents
of its keyslots, e.g. device resets, the driver must handle reprogramming the
keyslots. To do this, the driver may call ``blk_crypto_reprogram_all_keys()``.
Finally, if the driver used ``blk_crypto_profile_init()`` instead of
``devm_blk_crypto_profile_init()``, then it is responsible for calling
``blk_crypto_profile_destroy()`` when the crypto profile is no longer needed.
Layered Devices Layered Devices
=============== ===============
Request queue based layered devices like dm-rq that wish to support IE need to Request queue based layered devices like dm-rq that wish to support inline
create their own keyslot manager for their request queue, and expose whatever encryption need to create their own blk_crypto_profile for their request_queue,
functionality they choose. When a layered device wants to pass a clone of that and expose whatever functionality they choose. When a layered device wants to
request to another ``request_queue``, blk-crypto will initialize and prepare the pass a clone of that request to another request_queue, blk-crypto will
clone as necessary - see ``blk_crypto_insert_cloned_request`` in initialize and prepare the clone as necessary; see
``blk-crypto.c``. ``blk_crypto_insert_cloned_request()``.
Future Optimizations for layered devices
========================================
Creating a keyslot manager for a layered device uses up memory for each
keyslot, and in general, a layered device merely passes the request on to a
"child" device, so the keyslots in the layered device itself are completely
unused, and don't need any refcounting or keyslot programming. We can instead
define a new type of KSM; the "passthrough KSM", that layered devices can use
to advertise an unlimited number of keyslots, and support for any encryption
algorithms they choose, while not actually using any memory for each keyslot.
Another use case for the "passthrough KSM" is for IE devices that do not have a
limited number of keyslots.
Interaction between inline encryption and blk integrity Interaction between inline encryption and blk integrity
======================================================= =======================================================
@@ -257,7 +298,220 @@ Because there isn't any real hardware yet, it seems prudent to assume that
hardware implementations might not implement both features together correctly, hardware implementations might not implement both features together correctly,
and disallow the combination for now. Whenever a device supports integrity, the and disallow the combination for now. Whenever a device supports integrity, the
kernel will pretend that the device does not support hardware inline encryption kernel will pretend that the device does not support hardware inline encryption
(by essentially setting the keyslot manager in the request_queue of the device (by setting the blk_crypto_profile in the request_queue of the device to NULL).
to NULL). When the crypto API fallback is enabled, this means that all bios with When the crypto API fallback is enabled, this means that all bios with an
an encryption context will use the fallback, and IO will complete as usual. encryption context will use the fallback, and IO will complete as usual. When
When the fallback is disabled, a bio with an encryption context will be failed. the fallback is disabled, a bio with an encryption context will be failed.
.. _hardware_wrapped_keys:
Hardware-wrapped keys
=====================
Motivation and threat model
---------------------------
Linux storage encryption (dm-crypt, fscrypt, eCryptfs, etc.) traditionally
relies on the raw encryption key(s) being present in kernel memory so that the
encryption can be performed. This traditionally isn't seen as a problem because
the key(s) won't be present during an offline attack, which is the main type of
attack that storage encryption is intended to protect from.
However, there is an increasing desire to also protect users' data from other
types of attacks (to the extent possible), including:
- Cold boot attacks, where an attacker with physical access to a system suddenly
powers it off, then immediately dumps the system memory to extract recently
in-use encryption keys, then uses these keys to decrypt user data on-disk.
- Online attacks where the attacker is able to read kernel memory without fully
compromising the system, followed by an offline attack where any extracted
keys can be used to decrypt user data on-disk. An example of such an online
attack would be if the attacker is able to run some code on the system that
exploits a Meltdown-like vulnerability but is unable to escalate privileges.
- Online attacks where the attacker fully compromises the system, but their data
exfiltration is significantly time-limited and/or bandwidth-limited, so in
order to completely exfiltrate the data they need to extract the encryption
keys to use in a later offline attack.
Hardware-wrapped keys are a feature of inline encryption hardware that is
designed to protect users' data from the above attacks (to the extent possible),
without introducing limitations such as a maximum number of keys.
Note that it is impossible to **fully** protect users' data from these attacks.
Even in the attacks where the attacker "just" gets read access to kernel memory,
they can still extract any user data that is present in memory, including
plaintext pagecache pages of encrypted files. The focus here is just on
protecting the encryption keys, as those instantly give access to **all** user
data in any following offline attack, rather than just some of it (where which
data is included in that "some" might not be controlled by the attacker).
Solution overview
-----------------
Inline encryption hardware typically has "keyslots" into which software can
program keys for the hardware to use; the contents of keyslots typically can't
be read back by software. As such, the above security goals could be achieved
if the kernel simply erased its copy of the key(s) after programming them into
keyslot(s) and thereafter only referred to them via keyslot number.
However, that naive approach runs into the problem that it limits the number of
unlocked keys to the number of keyslots, which typically is a small number. In
cases where there is only one encryption key system-wide (e.g., a full-disk
encryption key), that can be tolerable. However, in general there can be many
logged-in users with many different keys, and/or many running applications with
application-specific encrypted storage areas. This is especially true if
file-based encryption (e.g. fscrypt) is being used.
Thus, it is important for the kernel to still have a way to "remind" the
hardware about a key, without actually having the raw key itself. This would
ensure that the number of hardware keyslots only limits the number of active I/O
requests, not other things such as the number of logged-in users, the number of
running apps, or the number of encrypted storage areas that apps can create.
Somewhat less importantly, it is also desirable that the raw keys are never
visible to software at all, even while being initially unlocked. This would
ensure that a read-only compromise of system memory will never allow a key to be
extracted to be used off-system, even if it occurs when a key is being unlocked.
To solve all these problems, some vendors of inline encryption hardware have
made their hardware support *hardware-wrapped keys*. Hardware-wrapped keys
are encrypted keys that can only be unwrapped (decrypted) and used by hardware
-- either by the inline encryption hardware itself, or by a dedicated hardware
block that can directly provision keys to the inline encryption hardware.
(We refer to them as "hardware-wrapped keys" rather than simply "wrapped keys"
to add some clarity in cases where there could be other types of wrapped keys,
such as in file-based encryption. Key wrapping is a commonly used technique.)
The key which wraps (encrypts) hardware-wrapped keys is a hardware-internal key
that is never exposed to software; it is either a persistent key (a "long-term
wrapping key") or a per-boot key (an "ephemeral wrapping key"). The long-term
wrapped form of the key is what is initially unlocked, but it is erased from
memory as soon as it is converted into an ephemerally-wrapped key. In-use
hardware-wrapped keys are always ephemerally-wrapped, not long-term wrapped.
As inline encryption hardware can only be used to encrypt/decrypt data on-disk,
the hardware also includes a level of indirection; it doesn't use the unwrapped
key directly for inline encryption, but rather derives both an inline encryption
key and a "software secret" from it. Software can use the "software secret" for
tasks that can't use the inline encryption hardware, such as filenames
encryption. The software secret is not protected from memory compromise.
Key hierarchy
-------------
Here is the key hierarchy for a hardware-wrapped key::
Hardware-wrapped key
|
|
<Hardware KDF>
|
-----------------------------
| |
Inline encryption key Software secret
The components are:
- *Hardware-wrapped key*: a key for the hardware's KDF (Key Derivation
Function), in ephemerally-wrapped form. The key wrapping algorithm is a
hardware implementation detail that doesn't impact kernel operation, but a
strong authenticated encryption algorithm such as AES-256-GCM is recommended.
- *Hardware KDF*: a KDF (Key Derivation Function) which the hardware uses to
derive subkeys after unwrapping the wrapped key. The hardware's choice of KDF
doesn't impact kernel operation, but it does need to be known for testing
purposes, and it's also assumed to have at least a 256-bit security strength.
All known hardware uses the SP800-108 KDF in Counter Mode with AES-256-CMAC,
with a particular choice of labels and contexts; new hardware should use this
already-vetted KDF.
- *Inline encryption key*: a derived key which the hardware directly provisions
to a keyslot of the inline encryption hardware, without exposing it to
software. In all known hardware, this will always be an AES-256-XTS key.
However, in principle other encryption algorithms could be supported too.
Hardware must derive distinct subkeys for each supported encryption algorithm.
- *Software secret*: a derived key which the hardware returns to software so
that software can use it for cryptographic tasks that can't use inline
encryption. This value is cryptographically isolated from the inline
encryption key, i.e. knowing one doesn't reveal the other. (The KDF ensures
this.) Currently, the software secret is always 32 bytes and thus is suitable
for cryptographic applications that require up to a 256-bit security strength.
Some use cases (e.g. full-disk encryption) won't require the software secret.
Example: in the case of fscrypt, the fscrypt master key (the key that protects a
particular set of encrypted directories) is made hardware-wrapped. The inline
encryption key is used as the file contents encryption key, while the software
secret (rather than the master key directly) is used to key fscrypt's KDF
(HKDF-SHA512) to derive other subkeys such as filenames encryption keys.
Note that currently this design assumes a single inline encryption key per
hardware-wrapped key, without any further key derivation. Thus, in the case of
fscrypt, currently hardware-wrapped keys are only compatible with the "inline
encryption optimized" settings, which use one file contents encryption key per
encryption policy rather than one per file. This design could be extended to
make the hardware derive per-file keys using per-file nonces passed down the
storage stack, and in fact some hardware already supports this; future work is
planned to remove this limitation by adding the corresponding kernel support.
Kernel support
--------------
The inline encryption support of the kernel's block layer ("blk-crypto") has
been extended to support hardware-wrapped keys as an alternative to standard
keys, when hardware support is available. This works in the following way:
- A ``key_types_supported`` field is added to the crypto capabilities in
``struct blk_crypto_profile``. This allows device drivers to declare that
they support standard keys, hardware-wrapped keys, or both.
- ``struct blk_crypto_key`` can now contain a hardware-wrapped key as an
alternative to a standard key; a ``key_type`` field is added to
``struct blk_crypto_config`` to distinguish between the different key types.
This allows users of blk-crypto to en/decrypt data using a hardware-wrapped
key in a way very similar to using a standard key.
- A new method ``blk_crypto_ll_ops::derive_sw_secret`` is added. Device drivers
that support hardware-wrapped keys must implement this method. Users of
blk-crypto can call ``blk_crypto_derive_sw_secret()`` to access this method.
- The programming and eviction of hardware-wrapped keys happens via
``blk_crypto_ll_ops::keyslot_program`` and
``blk_crypto_ll_ops::keyslot_evict``, just like it does for standard keys. If
a driver supports hardware-wrapped keys, then it must handle hardware-wrapped
keys being passed to these methods.
blk-crypto-fallback doesn't support hardware-wrapped keys. Therefore,
hardware-wrapped keys can only be used with actual inline encryption hardware.
Currently, the kernel only works with hardware-wrapped keys in
ephemerally-wrapped form. No generic kernel interfaces are provided for
generating or importing hardware-wrapped keys in the first place, or converting
them to ephemerally-wrapped form. In Android, SoC vendors are required to
support these operations in their KeyMint implementation (a hardware abstraction
layer in userspace); for details, see the `Android documentation
<https://source.android.com/security/encryption/hw-wrapped-keys>`_.
Testability
-----------
Both the hardware KDF and the inline encryption itself are well-defined
algorithms that don't depend on any secrets other than the unwrapped key.
Therefore, if the unwrapped key is known to software, these algorithms can be
reproduced in software in order to verify the ciphertext that is written to disk
by the inline encryption hardware.
However, the unwrapped key will only be known to software for testing if the
"import" functionality is used. Proper testing is not possible in the
"generate" case where the hardware generates the key itself. The correct
operation of the "generate" mode thus relies on the security and correctness of
the hardware RNG and its use to generate the key, as well as the testing of the
"import" mode as that should cover all parts other than the key generation.
For an example of a test that verifies the ciphertext written to disk in the
"import" mode, see the fscrypt hardware-wrapped key tests in xfstests, or
`Android's vts_kernel_encryption_test
<https://android.googlesource.com/platform/test/vts-testcase/kernel/+/refs/heads/master/encryption/>`_.

View File

@@ -4,39 +4,76 @@ The Kernel Address Sanitizer (KASAN)
Overview Overview
-------- --------
KernelAddressSANitizer (KASAN) is a dynamic memory safety error detector Kernel Address Sanitizer (KASAN) is a dynamic memory safety error detector
designed to find out-of-bound and use-after-free bugs. KASAN has three modes: designed to find out-of-bounds and use-after-free bugs.
1. generic KASAN (similar to userspace ASan), KASAN has three modes:
2. software tag-based KASAN (similar to userspace HWASan),
3. hardware tag-based KASAN (based on hardware memory tagging).
Generic KASAN is mainly used for debugging due to a large memory overhead. 1. Generic KASAN
Software tag-based KASAN can be used for dogfood testing as it has a lower 2. Software Tag-Based KASAN
memory overhead that allows using it with real workloads. Hardware tag-based 3. Hardware Tag-Based KASAN
KASAN comes with low memory and performance overheads and, therefore, can be
used in production. Either as an in-field memory bug detector or as a security
mitigation.
Software KASAN modes (#1 and #2) use compile-time instrumentation to insert Generic KASAN, enabled with CONFIG_KASAN_GENERIC, is the mode intended for
validity checks before every memory access and, therefore, require a compiler debugging, similar to userspace ASan. This mode is supported on many CPU
version that supports that. architectures, but it has significant performance and memory overheads.
Generic KASAN is supported in GCC and Clang. With GCC, it requires version Software Tag-Based KASAN or SW_TAGS KASAN, enabled with CONFIG_KASAN_SW_TAGS,
8.3.0 or later. Any supported Clang version is compatible, but detection of can be used for both debugging and dogfood testing, similar to userspace HWASan.
out-of-bounds accesses for global variables is only supported since Clang 11. This mode is only supported for arm64, but its moderate memory overhead allows
using it for testing on memory-restricted devices with real workloads.
Software tag-based KASAN mode is only supported in Clang. Hardware Tag-Based KASAN or HW_TAGS KASAN, enabled with CONFIG_KASAN_HW_TAGS,
is the mode intended to be used as an in-field memory bug detector or as a
security mitigation. This mode only works on arm64 CPUs that support MTE
(Memory Tagging Extension), but it has low memory and performance overheads and
thus can be used in production.
The hardware KASAN mode (#3) relies on hardware to perform the checks but For details about the memory and performance impact of each KASAN mode, see the
still requires a compiler version that supports memory tagging instructions. descriptions of the corresponding Kconfig options.
This mode is supported in GCC 10+ and Clang 11+.
Both software KASAN modes work with SLUB and SLAB memory allocators, The Generic and the Software Tag-Based modes are commonly referred to as the
while the hardware tag-based KASAN currently only supports SLUB. software modes. The Software Tag-Based and the Hardware Tag-Based modes are
referred to as the tag-based modes.
Currently, generic KASAN is supported for the x86_64, arm, arm64, xtensa, s390, Support
and riscv architectures, and tag-based KASAN modes are supported only for arm64. -------
Architectures
~~~~~~~~~~~~~
Generic KASAN is supported on x86_64, arm, arm64, powerpc, riscv, s390, and
xtensa, and the tag-based KASAN modes are supported only on arm64.
Compilers
~~~~~~~~~
Software KASAN modes use compile-time instrumentation to insert validity checks
before every memory access and thus require a compiler version that provides
support for that. The Hardware Tag-Based mode relies on hardware to perform
these checks but still requires a compiler version that supports the memory
tagging instructions.
Generic KASAN requires GCC version 8.3.0 or later
or any Clang version supported by the kernel.
Software Tag-Based KASAN requires GCC 11+
or any Clang version supported by the kernel.
Hardware Tag-Based KASAN requires GCC 10+ or Clang 12+.
Memory types
~~~~~~~~~~~~
Generic KASAN supports finding bugs in all of slab, page_alloc, vmap, vmalloc,
stack, and global memory.
Software Tag-Based KASAN supports slab, page_alloc, vmalloc, and stack memory.
Hardware Tag-Based KASAN supports slab, page_alloc, and non-executable vmalloc
memory.
For slab, both software KASAN modes support SLUB and SLAB allocators, while
Hardware Tag-Based KASAN only supports SLUB.
Usage Usage
----- -----
@@ -45,18 +82,81 @@ To enable KASAN, configure the kernel with::
CONFIG_KASAN=y CONFIG_KASAN=y
and choose between ``CONFIG_KASAN_GENERIC`` (to enable generic KASAN), and choose between ``CONFIG_KASAN_GENERIC`` (to enable Generic KASAN),
``CONFIG_KASAN_SW_TAGS`` (to enable software tag-based KASAN), and ``CONFIG_KASAN_SW_TAGS`` (to enable Software Tag-Based KASAN), and
``CONFIG_KASAN_HW_TAGS`` (to enable hardware tag-based KASAN). ``CONFIG_KASAN_HW_TAGS`` (to enable Hardware Tag-Based KASAN).
For software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and For the software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and
``CONFIG_KASAN_INLINE``. Outline and inline are compiler instrumentation types. ``CONFIG_KASAN_INLINE``. Outline and inline are compiler instrumentation types.
The former produces a smaller binary while the latter is 1.1-2 times faster. The former produces a smaller binary while the latter is up to 2 times faster.
To include alloc and free stack traces of affected slab objects into reports, To include alloc and free stack traces of affected slab objects into reports,
enable ``CONFIG_STACKTRACE``. To include alloc and free stack traces of affected enable ``CONFIG_STACKTRACE``. To include alloc and free stack traces of affected
physical pages, enable ``CONFIG_PAGE_OWNER`` and boot with ``page_owner=on``. physical pages, enable ``CONFIG_PAGE_OWNER`` and boot with ``page_owner=on``.
Boot parameters
~~~~~~~~~~~~~~~
KASAN is affected by the generic ``panic_on_warn`` command line parameter.
When it is enabled, KASAN panics the kernel after printing a bug report.
By default, KASAN prints a bug report only for the first invalid memory access.
With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This
effectively disables ``panic_on_warn`` for KASAN reports.
Alternatively, independent of ``panic_on_warn``, the ``kasan.fault=`` boot
parameter can be used to control panic and reporting behaviour:
- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
report or also panic the kernel (default: ``report``). The panic happens even
if ``kasan_multi_shot`` is enabled.
Software and Hardware Tag-Based KASAN modes (see the section about various
modes below) support altering stack trace collection behavior:
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
traces collection (default: ``on``).
- ``kasan.stack_ring_size=<number of entries>`` specifies the number of entries
in the stack ring (default: ``32768``).
Hardware Tag-Based KASAN mode is intended for use in production as a security
mitigation. Therefore, it supports additional boot parameters that allow
disabling KASAN altogether or controlling its features:
- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
- ``kasan.mode=sync``, ``=async`` or ``=asymm`` controls whether KASAN
is configured in synchronous, asynchronous or asymmetric mode of
execution (default: ``sync``).
Synchronous mode: a bad access is detected immediately when a tag
check fault occurs.
Asynchronous mode: a bad access detection is delayed. When a tag check
fault occurs, the information is stored in hardware (in the TFSR_EL1
register for arm64). The kernel periodically checks the hardware and
only reports tag faults during these checks.
Asymmetric mode: a bad access is detected synchronously on reads and
asynchronously on writes.
- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
allocations (default: ``on``).
- ``kasan.page_alloc.sample=<sampling interval>`` makes KASAN tag only every
Nth page_alloc allocation with the order equal to or greater than
``kasan.page_alloc.sample.order``, where N is the value of the ``sample``
parameter (default: ``1``, or tag every such allocation).
This parameter is intended to mitigate the performance overhead introduced
by KASAN.
Note that enabling this parameter makes Hardware Tag-Based KASAN skip checks
of allocations chosen by sampling and thus miss bad accesses to these
allocations. Use the default value for accurate bug detection.
- ``kasan.page_alloc.sample.order=<minimum page order>`` specifies the minimum
order of allocations that are affected by sampling (default: ``3``).
Only applies when ``kasan.page_alloc.sample`` is set to a value greater
than ``1``.
This parameter is intended to allow sampling only large page_alloc
allocations, which is the biggest source of the performance overhead.
Error reports Error reports
~~~~~~~~~~~~~ ~~~~~~~~~~~~~
@@ -146,7 +246,7 @@ is either 8 or 16 aligned bytes depending on KASAN mode. Each number in the
memory state section of the report shows the state of one of the memory memory state section of the report shows the state of one of the memory
granules that surround the accessed address. granules that surround the accessed address.
For generic KASAN, the size of each memory granule is 8. The state of each For Generic KASAN, the size of each memory granule is 8. The state of each
granule is encoded in one shadow byte. Those 8 bytes can be accessible, granule is encoded in one shadow byte. Those 8 bytes can be accessible,
partially accessible, freed, or be a part of a redzone. KASAN uses the following partially accessible, freed, or be a part of a redzone. KASAN uses the following
encoding for each shadow byte: 00 means that all 8 bytes of the corresponding encoding for each shadow byte: 00 means that all 8 bytes of the corresponding
@@ -171,41 +271,6 @@ traces point to places in code that interacted with the object but that are not
directly present in the bad access stack trace. Currently, this includes directly present in the bad access stack trace. Currently, this includes
call_rcu() and workqueue queuing. call_rcu() and workqueue queuing.
Boot parameters
~~~~~~~~~~~~~~~
KASAN is affected by the generic ``panic_on_warn`` command line parameter.
When it is enabled, KASAN panics the kernel after printing a bug report.
By default, KASAN prints a bug report only for the first invalid memory access.
With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This
effectively disables ``panic_on_warn`` for KASAN reports.
Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot
parameter can be used to control panic and reporting behaviour:
- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
report or also panic the kernel (default: ``report``). The panic happens even
if ``kasan_multi_shot`` is enabled.
Hardware tag-based KASAN mode (see the section about various modes below) is
intended for use in production as a security mitigation. Therefore, it supports
additional boot parameters that allow disabling KASAN or controlling features:
- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
- ``kasan.mode=sync`` or ``=async`` controls whether KASAN is configured in
synchronous or asynchronous mode of execution (default: ``sync``).
Synchronous mode: a bad access is detected immediately when a tag
check fault occurs.
Asynchronous mode: a bad access detection is delayed. When a tag check
fault occurs, the information is stored in hardware (in the TFSR_EL1
register for arm64). The kernel periodically checks the hardware and
only reports tag faults during these checks.
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
traces collection (default: ``on``).
Implementation details Implementation details
---------------------- ----------------------
@@ -244,49 +309,46 @@ outline-instrumented kernel.
Generic KASAN is the only mode that delays the reuse of freed objects via Generic KASAN is the only mode that delays the reuse of freed objects via
quarantine (see mm/kasan/quarantine.c for implementation). quarantine (see mm/kasan/quarantine.c for implementation).
Software tag-based KASAN Software Tag-Based KASAN
~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~
Software tag-based KASAN uses a software memory tagging approach to checking Software Tag-Based KASAN uses a software memory tagging approach to checking
access validity. It is currently only implemented for the arm64 architecture. access validity. It is currently only implemented for the arm64 architecture.
Software tag-based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs Software Tag-Based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs
to store a pointer tag in the top byte of kernel pointers. It uses shadow memory to store a pointer tag in the top byte of kernel pointers. It uses shadow memory
to store memory tags associated with each 16-byte memory cell (therefore, it to store memory tags associated with each 16-byte memory cell (therefore, it
dedicates 1/16th of the kernel memory for shadow memory). dedicates 1/16th of the kernel memory for shadow memory).
On each memory allocation, software tag-based KASAN generates a random tag, tags On each memory allocation, Software Tag-Based KASAN generates a random tag, tags
the allocated memory with this tag, and embeds the same tag into the returned the allocated memory with this tag, and embeds the same tag into the returned
pointer. pointer.
Software tag-based KASAN uses compile-time instrumentation to insert checks Software Tag-Based KASAN uses compile-time instrumentation to insert checks
before each memory access. These checks make sure that the tag of the memory before each memory access. These checks make sure that the tag of the memory
that is being accessed is equal to the tag of the pointer that is used to access that is being accessed is equal to the tag of the pointer that is used to access
this memory. In case of a tag mismatch, software tag-based KASAN prints a bug this memory. In case of a tag mismatch, Software Tag-Based KASAN prints a bug
report. report.
Software tag-based KASAN also has two instrumentation modes (outline, which Software Tag-Based KASAN also has two instrumentation modes (outline, which
emits callbacks to check memory accesses; and inline, which performs the shadow emits callbacks to check memory accesses; and inline, which performs the shadow
memory checks inline). With outline instrumentation mode, a bug report is memory checks inline). With outline instrumentation mode, a bug report is
printed from the function that performs the access check. With inline printed from the function that performs the access check. With inline
instrumentation, a ``brk`` instruction is emitted by the compiler, and a instrumentation, a ``brk`` instruction is emitted by the compiler, and a
dedicated ``brk`` handler is used to print bug reports. dedicated ``brk`` handler is used to print bug reports.
Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through Software Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through
pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
reserved to tag freed memory regions. reserved to tag freed memory regions.
Software tag-based KASAN currently only supports tagging of slab and page_alloc Hardware Tag-Based KASAN
memory.
Hardware tag-based KASAN
~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~
Hardware tag-based KASAN is similar to the software mode in concept but uses Hardware Tag-Based KASAN is similar to the software mode in concept but uses
hardware memory tagging support instead of compiler instrumentation and hardware memory tagging support instead of compiler instrumentation and
shadow memory. shadow memory.
Hardware tag-based KASAN is currently only implemented for arm64 architecture Hardware Tag-Based KASAN is currently only implemented for arm64 architecture
and based on both arm64 Memory Tagging Extension (MTE) introduced in ARMv8.5 and based on both arm64 Memory Tagging Extension (MTE) introduced in ARMv8.5
Instruction Set Architecture and Top Byte Ignore (TBI). Instruction Set Architecture and Top Byte Ignore (TBI).
@@ -296,26 +358,25 @@ access, hardware makes sure that the tag of the memory that is being accessed is
equal to the tag of the pointer that is used to access this memory. In case of a equal to the tag of the pointer that is used to access this memory. In case of a
tag mismatch, a fault is generated, and a report is printed. tag mismatch, a fault is generated, and a report is printed.
Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through Hardware Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through
pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
reserved to tag freed memory regions. reserved to tag freed memory regions.
Hardware tag-based KASAN currently only supports tagging of slab and page_alloc If the hardware does not support MTE (pre ARMv8.5), Hardware Tag-Based KASAN
memory.
If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN
will not be enabled. In this case, all KASAN boot parameters are ignored. will not be enabled. In this case, all KASAN boot parameters are ignored.
Note that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being Note that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being
enabled. Even when ``kasan.mode=off`` is provided or when the hardware does not enabled. Even when ``kasan.mode=off`` is provided or when the hardware does not
support MTE (but supports TBI). support MTE (but supports TBI).
Hardware tag-based KASAN only reports the first found bug. After that, MTE tag Hardware Tag-Based KASAN only reports the first found bug. After that, MTE tag
checking gets disabled. checking gets disabled.
Shadow memory Shadow memory
------------- -------------
The contents of this section are only applicable to software KASAN modes.
The kernel maps memory in several different parts of the address space. The kernel maps memory in several different parts of the address space.
The range of kernel virtual addresses is large: there is not enough real The range of kernel virtual addresses is large: there is not enough real
memory to support a real shadow region for every address that could be memory to support a real shadow region for every address that could be
@@ -346,7 +407,7 @@ CONFIG_KASAN_VMALLOC
With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the
cost of greater memory usage. Currently, this is supported on x86, cost of greater memory usage. Currently, this is supported on x86,
riscv, s390, and powerpc. arm64, riscv, s390, and powerpc.
This works by hooking into vmalloc and vmap and dynamically This works by hooking into vmalloc and vmap and dynamically
allocating real shadow memory to back the mappings. allocating real shadow memory to back the mappings.
@@ -406,19 +467,18 @@ generic ``noinstr`` one.
Note that disabling compiler instrumentation (either on a per-file or a Note that disabling compiler instrumentation (either on a per-file or a
per-function basis) makes KASAN ignore the accesses that happen directly in per-function basis) makes KASAN ignore the accesses that happen directly in
that code for software KASAN modes. It does not help when the accesses happen that code for software KASAN modes. It does not help when the accesses happen
indirectly (through calls to instrumented functions) or with the hardware indirectly (through calls to instrumented functions) or with Hardware
tag-based mode that does not use compiler instrumentation. Tag-Based KASAN, which does not use compiler instrumentation.
For software KASAN modes, to disable KASAN reports in a part of the kernel code For software KASAN modes, to disable KASAN reports in a part of the kernel code
for the current task, annotate this part of the code with a for the current task, annotate this part of the code with a
``kasan_disable_current()``/``kasan_enable_current()`` section. This also ``kasan_disable_current()``/``kasan_enable_current()`` section. This also
disables the reports for indirect accesses that happen through function calls. disables the reports for indirect accesses that happen through function calls.
For tag-based KASAN modes (include the hardware one), to disable access For tag-based KASAN modes, to disable access checking, use
checking, use ``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that ``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that temporarily
temporarily disabling access checking via ``page_kasan_tag_reset()`` requires disabling access checking via ``page_kasan_tag_reset()`` requires saving and
saving and restoring the per-page KASAN tag via restoring the per-page KASAN tag via ``page_kasan_tag``/``page_kasan_tag_set``.
``page_kasan_tag``/``page_kasan_tag_set``.
Tests Tests
~~~~~ ~~~~~

View File

@@ -0,0 +1,99 @@
dm_bow (backup on write)
========================
dm_bow is a device mapper driver that uses the free space on a device to back up
data that is overwritten. The changes can then be committed by a simple state
change, or rolled back by removing the dm_bow device and running a command line
utility over the underlying device.
dm_bow has three states, set by writing 1 or 2 to /sys/block/dm-?/bow/state.
It is only possible to go from state 0 (initial state) to state 1, and then from
state 1 to state 2.
State 0: dm_bow collects all trims to the device and assumes that these mark
free space on the overlying file system that can be safely used. Typically the
mount code would create the dm_bow device, mount the file system, call the
FITRIM ioctl on the file system then switch to state 1. These trims are not
propagated to the underlying device.
State 1: All writes to the device cause the underlying data to be backed up to
the free (trimmed) area as needed in such a way that they can be restored.
However, the writes, with one exception, then happen exactly as they would
without dm_bow, so the device is always in a good final state. The exception is
that sector 0 is used to keep a log of the latest changes, both to indicate that
we are in this state and to allow rollback. See below for all details. If there
isn't enough free space, writes are failed with -ENOSPC.
State 2: The transition to state 2 triggers replacing the special sector 0 with
the normal sector 0, and the freeing of all state information. dm_bow then
becomes a pass-through driver, allowing the device to continue to be used with
minimal performance impact.
Usage
=====
dm-bow takes one command line parameter, the name of the underlying device.
dm-bow will typically be used in the following way. dm-bow will be loaded with a
suitable underlying device and the resultant device will be mounted. A file
system trim will be issued via the FITRIM ioctl, then the device will be
switched to state 1. The file system will now be used as normal. At some point,
the changes can either be committed by switching to state 2, or rolled back by
unmounting the file system, removing the dm-bow device and running the command
line utility. Note that rebooting the device will be equivalent to unmounting
and removing, but the command line utility must still be run.
Details of operation in state 1
===============================
dm_bow maintains a type for all sectors. A sector can be any of:
SECTOR0
SECTOR0_CURRENT
UNCHANGED
FREE
CHANGED
BACKUP
SECTOR0 is the first sector on the device, and is used to hold the log of
changes. This is the one exception.
SECTOR0_CURRENT is a sector picked from the FREE sectors, and is where reads and
writes from the true sector zero are redirected to. Note that like any backup
sector, if the sector is written to directly, it must be moved again.
UNCHANGED means that the sector has not been changed since we entered state 1.
Thus if it is written to or trimmed, the contents must first be backed up.
FREE means that the sector was trimmed in state 0 and has not yet been written
to or used for backup. On being written to, a FREE sector is changed to CHANGED.
CHANGED means that the sector has been modified, and can be further modified
without further backup.
BACKUP means that this is a free sector being used as a backup. On being written
to, the contents must first be backed up again.
All backup operations are logged to the first sector. The log sector has the
format:
--------------------------------------------------------
| Magic | Count | Sequence | Log entry | Log entry | …
--------------------------------------------------------
Magic is a magic number. Count is the number of log entries. Sequence is 0
initially. A log entry is
-----------------------------------
| Source | Dest | Size | Checksum |
-----------------------------------
When SECTOR0 is full, the log sector is backed up and another empty log sector
created with sequence number one higher. The first entry in any log sector with
sequence > 0 therefore must be the log of the backing up of the previous log
sector. Note that sequence is not strictly needed, but is a useful sanity check
and potentially limits the time spent trying to restore a corrupted snapshot.
On entering state 1, dm_bow has a list of free sectors. All other sectors are
unchanged. SECTOR0_CURRENT is selected from the free sectors and the contents of
sector 0 are copied there. The sector 0 is backed up, which triggers the first
log entry to be written.

View File

@@ -0,0 +1,51 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/misc/qemu,vcpu-stall-detector.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: VCPU stall detector
description:
This binding describes a CPU stall detector mechanism for virtual CPUs
which is accessed through MMIO.
maintainers:
- Sebastian Ene <sebastianene@google.com>
properties:
compatible:
enum:
- qemu,vcpu-stall-detector
reg:
maxItems: 1
clock-frequency:
$ref: /schemas/types.yaml#/definitions/uint32
description: |
The internal clock of the stall detector peripheral measured in Hz used
to decrement its internal counter register on each tick.
Defaults to 10 if unset.
default: 10
timeout-sec:
description: |
The stall detector expiration timeout measured in seconds.
Defaults to 8 if unset. Please note that it also takes into account the
time spent while the VCPU is not running.
default: 8
required:
- compatible
additionalProperties: false
examples:
- |
vmwdt@9030000 {
compatible = "qemu,vcpu-stall-detector";
reg = <0x9030000 0x10000>;
clock-frequency = <10>;
timeout-sec = <8>;
};

View File

@@ -119,6 +119,18 @@ properties:
If present, HS400 command responses are sampled on rising edges. If present, HS400 command responses are sampled on rising edges.
If not present, HS400 command responses are sampled on falling edges. If not present, HS400 command responses are sampled on falling edges.
mediatek,hs400-ds-dly3:
$ref: /schemas/types.yaml#/definitions/uint32
description:
Gear of the third delay line for DS for input data latch in data
pad macro, there are 32 stages from 0 to 31.
For different corner IC, the time is different about one step, it is
about 100ps.
The value is confirmed by doing scan and calibration to find a best
value with corner IC and it is valid only for HS400 mode.
minimum: 0
maximum: 31
mediatek,latch-ck: mediatek,latch-ck:
$ref: /schemas/types.yaml#/definitions/uint32 $ref: /schemas/types.yaml#/definitions/uint32
description: description:

View File

@@ -12,12 +12,14 @@ maintainers:
properties: properties:
compatible: compatible:
const: arm,cmn-600 enum:
- arm,cmn-600
- arm,ci-700
reg: reg:
items: items:
- description: Physical address of the base (PERIPHBASE) and - description: Physical address of the base (PERIPHBASE) and
size (up to 64MB) of the configuration address space. size of the configuration address space.
interrupts: interrupts:
minItems: 1 minItems: 1
@@ -31,13 +33,22 @@ properties:
arm,root-node: arm,root-node:
$ref: /schemas/types.yaml#/definitions/uint32 $ref: /schemas/types.yaml#/definitions/uint32
description: Offset from PERIPHBASE of the configuration description: Offset from PERIPHBASE of CMN-600's configuration
discovery node (see TRM definition of ROOTNODEBASE). discovery node (see TRM definition of ROOTNODEBASE). Not
relevant for newer CMN/CI products.
required: required:
- compatible - compatible
- reg - reg
- interrupts - interrupts
if:
properties:
compatible:
contains:
const: arm,cmn-600
then:
required:
- arm,root-node - arm,root-node
additionalProperties: false additionalProperties: false

View File

@@ -0,0 +1,46 @@
# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
%YAML 1.2
---
$id: http://devicetree.org/schemas/reserved-memory/google,open-dice.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Open Profile for DICE Device Tree Bindings
description: |
This binding represents a reserved memory region containing data
generated by the Open Profile for DICE protocol.
See https://pigweed.googlesource.com/open-dice/
maintainers:
- David Brazdil <dbrazdil@google.com>
allOf:
- $ref: "reserved-memory.yaml"
properties:
compatible:
const: google,open-dice
reg:
description: page-aligned region of memory containing DICE data
required:
- compatible
- reg
- no-map
unevaluatedProperties: false
examples:
- |
reserved-memory {
#address-cells = <2>;
#size-cells = <1>;
dice: dice@12340000 {
compatible = "google,open-dice";
reg = <0x00 0x12340000 0x2000>;
no-map;
};
};

View File

@@ -0,0 +1 @@
per-file f2fs**=file:/fs/f2fs/OWNERS

View File

@@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios:
immutable and bit-for-bit identical to the official golden image for immutable and bit-for-bit identical to the official golden image for
their releases due to security and other considerations and their releases due to security and other considerations and
- hope to save some extra storage space with guaranteed end-to-end performance - hope to minimize extra storage space with guaranteed end-to-end performance
by using reduced metadata and transparent file compression, especially by using compact layout, transparent file compression and direct access,
for those embedded devices with limited memory (ex, smartphone); especially for those embedded devices with limited memory and high-density
hosts with numerous containers;
Here is the main features of EROFS: Here is the main features of EROFS:
@@ -51,7 +52,9 @@ Here is the main features of EROFS:
- Support POSIX.1e ACLs by using xattrs; - Support POSIX.1e ACLs by using xattrs;
- Support transparent data compression as an option: - Support transparent data compression as an option:
LZ4 algorithm with the fixed-sized output compression for high performance. LZ4 algorithm with the fixed-sized output compression for high performance;
- Multiple device support for multi-layer container images.
The following git tree provides the file system user-space tools under The following git tree provides the file system user-space tools under
development (ex, formatting tool mkfs.erofs): development (ex, formatting tool mkfs.erofs):
@@ -87,8 +90,17 @@ cache_strategy=%s Select a strategy for cached decompression from now on:
dax={always,never} Use direct access (no page cache). See dax={always,never} Use direct access (no page cache). See
Documentation/filesystems/dax.rst. Documentation/filesystems/dax.rst.
dax A legacy option which is an alias for ``dax=always``. dax A legacy option which is an alias for ``dax=always``.
device=%s Specify a path to an extra device to be used together.
=================== ========================================================= =================== =========================================================
Sysfs Entries
=============
Information about mounted erofs file systems can be found in /sys/fs/erofs.
Each mounted filesystem will have a directory in /sys/fs/erofs based on its
device name (e.g., /sys/fs/erofs/sda).
(see also Documentation/ABI/testing/sysfs-fs-erofs)
On-disk details On-disk details
=============== ===============

View File

@@ -25,10 +25,14 @@ a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs).
- git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git - git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git
For reporting bugs and sending patches, please use the following mailing list: For sending patches, please use the following mailing list:
- linux-f2fs-devel@lists.sourceforge.net - linux-f2fs-devel@lists.sourceforge.net
For reporting bugs, please use the following f2fs bug tracker link:
- https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs
Background and Design issues Background and Design issues
============================ ============================
@@ -154,6 +158,8 @@ nobarrier This option can be used if underlying storage guarantees
If this option is set, no cache_flush commands are issued If this option is set, no cache_flush commands are issued
but f2fs still guarantees the write ordering of all the but f2fs still guarantees the write ordering of all the
data writes. data writes.
barrier If this option is set, cache_flush commands are allowed to be
issued.
fastboot This option is used when a system wants to reduce mount fastboot This option is used when a system wants to reduce mount
time as much as possible, even though normal performance time as much as possible, even though normal performance
can be sacrificed. can be sacrificed.
@@ -198,10 +204,30 @@ fault_type=%d Support configuring fault injection type, should be
FAULT_WRITE_IO 0x000004000 FAULT_WRITE_IO 0x000004000
FAULT_SLAB_ALLOC 0x000008000 FAULT_SLAB_ALLOC 0x000008000
FAULT_DQUOT_INIT 0x000010000 FAULT_DQUOT_INIT 0x000010000
FAULT_LOCK_OP 0x000020000
FAULT_BLKADDR 0x000040000
=================== =========== =================== ===========
mode=%s Control block allocation mode which supports "adaptive" mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random and "lfs". In "lfs" mode, there should be no random
writes towards main area. writes towards main area.
"fragment:segment" and "fragment:block" are newly added here.
These are developer options for experiments to simulate filesystem
fragmentation/after-GC situation itself. The developers use these
modes to understand filesystem fragmentation/after-GC condition well,
and eventually get some insights to handle them better.
In "fragment:segment", f2fs allocates a new segment in random
position. With this, we can simulate the after-GC condition.
In "fragment:block", we can scatter block allocation with
"max_fragment_chunk" and "max_fragment_hole" sysfs nodes.
We added some randomness to both chunk and hole size to make
it close to realistic IO pattern. So, in this mode, f2fs will allocate
1..<max_fragment_chunk> blocks in a chunk and make a hole in the
length of 1..<max_fragment_hole> by turns. With this, the newly
allocated blocks will be scattered throughout the whole partition.
Note that "fragment:block" implicitly enables "fragment:segment"
option for more randomness.
Please use these options only for your experiments; we strongly
recommend re-formatting the filesystem after using these options.
io_bits=%u Set the bit size of write IO requests. It should be set io_bits=%u Set the bit size of write IO requests. It should be set
with "mode=lfs". with "mode=lfs".
usrquota Enable plain user disk quota accounting. usrquota Enable plain user disk quota accounting.
@@ -216,12 +242,6 @@ offgrpjquota Turn off group journalled quota.
offprjjquota Turn off project journalled quota. offprjjquota Turn off project journalled quota.
quota Enable plain user disk quota accounting. quota Enable plain user disk quota accounting.
noquota Disable all plain disk quota option. noquota Disable all plain disk quota option.
whint_mode=%s Control which write hints are passed down to block
layer. This supports "off", "user-based", and
"fs-based". In "off" mode (default), f2fs does not pass
down hints. In "user-based" mode, f2fs tries to pass
down hints given by users. And in "fs-based" mode, f2fs
passes down hints with its policy.
alloc_mode=%s Adjust block allocation policy, which supports "reuse" alloc_mode=%s Adjust block allocation policy, which supports "reuse"
and "default". and "default".
fsync_mode=%s Control the policy of fsync. Currently supports "posix", fsync_mode=%s Control the policy of fsync. Currently supports "posix",
@@ -323,6 +343,15 @@ discard_unit=%s Control discard unit, the argument can be "block", "segment"
default, it is helpful for large sized SMR or ZNS devices to default, it is helpful for large sized SMR or ZNS devices to
reduce memory cost by getting rid of fs metadata supports small reduce memory cost by getting rid of fs metadata supports small
discard. discard.
memory=%s Control memory mode. This supports "normal" and "low" modes.
"low" mode is introduced to support low memory devices.
Because of the nature of low memory devices, in this mode, f2fs
will try to save memory sometimes by sacrificing performance.
"normal" mode is the default mode and same as before.
age_extent_cache Enable an age extent cache based on rb-tree. It records
data block update frequency of the extent per inode, in
order to provide better temperature hints for data block
allocation.
======================== ============================================================ ======================== ============================================================
Debugfs Entries Debugfs Entries
@@ -732,70 +761,6 @@ In order to identify whether the data in the victim segment are valid or not,
F2FS manages a bitmap. Each bit represents the validity of a block, and the F2FS manages a bitmap. Each bit represents the validity of a block, and the
bitmap is composed of a bit stream covering whole blocks in main area. bitmap is composed of a bit stream covering whole blocks in main area.
Write-hint Policy
-----------------
1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
2) whint_mode=user-based. F2FS tries to pass down hints given by
users.
===================== ======================== ===================
User F2FS Block
===================== ======================== ===================
N/A META WRITE_LIFE_NOT_SET
N/A HOT_NODE "
N/A WARM_NODE "
N/A COLD_NODE "
ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
extension list " "
-- buffered io
WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE " "
WRITE_LIFE_MEDIUM " "
WRITE_LIFE_LONG " "
-- direct io
WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE " WRITE_LIFE_NONE
WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
WRITE_LIFE_LONG " WRITE_LIFE_LONG
===================== ======================== ===================
3) whint_mode=fs-based. F2FS passes down hints with its policy.
===================== ======================== ===================
User F2FS Block
===================== ======================== ===================
N/A META WRITE_LIFE_MEDIUM;
N/A HOT_NODE WRITE_LIFE_NOT_SET
N/A WARM_NODE "
N/A COLD_NODE WRITE_LIFE_NONE
ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME
extension list " "
-- buffered io
WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG
WRITE_LIFE_NONE " "
WRITE_LIFE_MEDIUM " "
WRITE_LIFE_LONG " "
-- direct io
WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME
WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT
WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE " WRITE_LIFE_NONE
WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
WRITE_LIFE_LONG " WRITE_LIFE_LONG
===================== ======================== ===================
Fallocate(2) Policy Fallocate(2) Policy
------------------- -------------------

View File

@@ -77,11 +77,11 @@ Side-channel attacks
fscrypt is only resistant to side-channel attacks, such as timing or fscrypt is only resistant to side-channel attacks, such as timing or
electromagnetic attacks, to the extent that the underlying Linux electromagnetic attacks, to the extent that the underlying Linux
Cryptographic API algorithms are. If a vulnerable algorithm is used, Cryptographic API algorithms or inline encryption hardware are. If a
such as a table-based implementation of AES, it may be possible for an vulnerable algorithm is used, such as a table-based implementation of
attacker to mount a side channel attack against the online system. AES, it may be possible for an attacker to mount a side channel attack
Side channel attacks may also be mounted against applications against the online system. Side channel attacks may also be mounted
consuming decrypted data. against applications consuming decrypted data.
Unauthorized file access Unauthorized file access
~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -337,6 +337,8 @@ Currently, the following pairs of encryption modes are supported:
- AES-256-XTS for contents and AES-256-CTS-CBC for filenames - AES-256-XTS for contents and AES-256-CTS-CBC for filenames
- AES-128-CBC for contents and AES-128-CTS-CBC for filenames - AES-128-CBC for contents and AES-128-CTS-CBC for filenames
- Adiantum for both contents and filenames - Adiantum for both contents and filenames
- AES-256-XTS for contents and AES-256-HCTR2 for filenames (v2 policies only)
- SM4-XTS for contents and SM4-CTS-CBC for filenames (v2 policies only)
If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair. If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
@@ -357,6 +359,23 @@ To use Adiantum, CONFIG_CRYPTO_ADIANTUM must be enabled. Also, fast
implementations of ChaCha and NHPoly1305 should be enabled, e.g. implementations of ChaCha and NHPoly1305 should be enabled, e.g.
CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM. CONFIG_CRYPTO_CHACHA20_NEON and CONFIG_CRYPTO_NHPOLY1305_NEON for ARM.
AES-256-HCTR2 is another true wide-block encryption mode that is intended for
use on CPUs with dedicated crypto instructions. AES-256-HCTR2 has the property
that a bitflip in the plaintext changes the entire ciphertext. This property
makes it desirable for filename encryption since initialization vectors are
reused within a directory. For more details on AES-256-HCTR2, see the paper
"Length-preserving encryption with HCTR2"
(https://eprint.iacr.org/2021/1441.pdf). To use AES-256-HCTR2,
CONFIG_CRYPTO_HCTR2 must be enabled. Also, fast implementations of XCTR and
POLYVAL should be enabled, e.g. CRYPTO_POLYVAL_ARM64_CE and
CRYPTO_AES_ARM64_CE_BLK for ARM64.
SM4 is a Chinese block cipher that is an alternative to AES. It has
not seen as much security review as AES, and it only has a 128-bit key
size. It may be useful in cases where its use is mandated.
Otherwise, it should not be used. For SM4 support to be available, it
also needs to be enabled in the kernel crypto API.
New encryption modes can be added relatively easily, without changes New encryption modes can be added relatively easily, without changes
to individual filesystems. However, authenticated encryption (AE) to individual filesystems. However, authenticated encryption (AE)
modes are not currently supported because of the difficulty of dealing modes are not currently supported because of the difficulty of dealing
@@ -404,11 +423,11 @@ alternatively has the file's nonce (for `DIRECT_KEY policies`_) or
inode number (for `IV_INO_LBLK_64 policies`_) included in the IVs. inode number (for `IV_INO_LBLK_64 policies`_) included in the IVs.
Thus, IV reuse is limited to within a single directory. Thus, IV reuse is limited to within a single directory.
With CTS-CBC, the IV reuse means that when the plaintext filenames With CTS-CBC, the IV reuse means that when the plaintext filenames share a
share a common prefix at least as long as the cipher block size (16 common prefix at least as long as the cipher block size (16 bytes for AES), the
bytes for AES), the corresponding encrypted filenames will also share corresponding encrypted filenames will also share a common prefix. This is
a common prefix. This is undesirable. Adiantum does not have this undesirable. Adiantum and HCTR2 do not have this weakness, as they are
weakness, as it is a wide-block encryption mode. wide-block encryption modes.
All supported filenames encryption modes accept any plaintext length All supported filenames encryption modes accept any plaintext length
>= 16 bytes; cipher block alignment is not required. However, >= 16 bytes; cipher block alignment is not required. However,
@@ -1047,8 +1066,8 @@ astute users may notice some differences in behavior:
may be used to overwrite the source files but isn't guaranteed to be may be used to overwrite the source files but isn't guaranteed to be
effective on all filesystems and storage devices. effective on all filesystems and storage devices.
- Direct I/O is not supported on encrypted files. Attempts to use - Direct I/O is supported on encrypted files only under some
direct I/O on such files will fall back to buffered I/O. circumstances. For details, see `Direct I/O support`_.
- The fallocate operations FALLOC_FL_COLLAPSE_RANGE and - The fallocate operations FALLOC_FL_COLLAPSE_RANGE and
FALLOC_FL_INSERT_RANGE are not supported on encrypted files and will FALLOC_FL_INSERT_RANGE are not supported on encrypted files and will
@@ -1135,6 +1154,71 @@ where applications may later write sensitive data. It is recommended
that systems implementing a form of "verified boot" take advantage of that systems implementing a form of "verified boot" take advantage of
this by validating all top-level encryption policies prior to access. this by validating all top-level encryption policies prior to access.
Inline encryption support
=========================
By default, fscrypt uses the kernel crypto API for all cryptographic
operations (other than HKDF, which fscrypt partially implements
itself). The kernel crypto API supports hardware crypto accelerators,
but only ones that work in the traditional way where all inputs and
outputs (e.g. plaintexts and ciphertexts) are in memory. fscrypt can
take advantage of such hardware, but the traditional acceleration
model isn't particularly efficient and fscrypt hasn't been optimized
for it.
Instead, many newer systems (especially mobile SoCs) have *inline
encryption hardware* that can encrypt/decrypt data while it is on its
way to/from the storage device. Linux supports inline encryption
through a set of extensions to the block layer called *blk-crypto*.
blk-crypto allows filesystems to attach encryption contexts to bios
(I/O requests) to specify how the data will be encrypted or decrypted
in-line. For more information about blk-crypto, see
:ref:`Documentation/block/inline-encryption.rst <inline_encryption>`.
On supported filesystems (currently ext4 and f2fs), fscrypt can use
blk-crypto instead of the kernel crypto API to encrypt/decrypt file
contents. To enable this, set CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y in
the kernel configuration, and specify the "inlinecrypt" mount option
when mounting the filesystem.
Note that the "inlinecrypt" mount option just specifies to use inline
encryption when possible; it doesn't force its use. fscrypt will
still fall back to using the kernel crypto API on files where the
inline encryption hardware doesn't have the needed crypto capabilities
(e.g. support for the needed encryption algorithm and data unit size)
and where blk-crypto-fallback is unusable. (For blk-crypto-fallback
to be usable, it must be enabled in the kernel configuration with
CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y.)
Currently fscrypt always uses the filesystem block size (which is
usually 4096 bytes) as the data unit size. Therefore, it can only use
inline encryption hardware that supports that data unit size.
Inline encryption doesn't affect the ciphertext or other aspects of
the on-disk format, so users may freely switch back and forth between
using "inlinecrypt" and not using "inlinecrypt".
Direct I/O support
==================
For direct I/O on an encrypted file to work, the following conditions
must be met (in addition to the conditions for direct I/O on an
unencrypted file):
* The file must be using inline encryption. Usually this means that
the filesystem must be mounted with ``-o inlinecrypt`` and inline
encryption hardware must be present. However, a software fallback
is also available. For details, see `Inline encryption support`_.
* The I/O request must be fully aligned to the filesystem block size.
This means that the file position the I/O is targeting, the lengths
of all I/O segments, and the memory addresses of all I/O buffers
must be multiples of this value. Note that the filesystem block
size may be greater than the logical block size of the block device.
If either of the above conditions is not met, then direct I/O on the
encrypted file will fall back to buffered I/O.
Implementation details Implementation details
====================== ======================
@@ -1184,6 +1268,13 @@ keys`_ and `DIRECT_KEY policies`_.
Data path changes Data path changes
----------------- -----------------
When inline encryption is used, filesystems just need to associate
encryption contexts with bios to specify how the block layer or the
inline encryption hardware will encrypt/decrypt the file contents.
When inline encryption isn't used, filesystems must encrypt/decrypt
the file contents themselves, as described below:
For the read path (->readpage()) of regular files, filesystems can For the read path (->readpage()) of regular files, filesystems can
read the ciphertext into the page cache and decrypt it in-place. The read the ciphertext into the page cache and decrypt it in-place. The
page lock must be held until decryption has finished, to prevent the page lock must be held until decryption has finished, to prevent the
@@ -1197,18 +1288,6 @@ buffer. Some filesystems, such as UBIFS, already use temporary
buffers regardless of encryption. Other filesystems, such as ext4 and buffers regardless of encryption. Other filesystems, such as ext4 and
F2FS, have to allocate bounce pages specially for encryption. F2FS, have to allocate bounce pages specially for encryption.
Fscrypt is also able to use inline encryption hardware instead of the
kernel crypto API for en/decryption of file contents. When possible,
and if directed to do so (by specifying the 'inlinecrypt' mount option
for an ext4/F2FS filesystem), it adds encryption contexts to bios and
uses blk-crypto to perform the en/decryption instead of making use of
the above read/write path changes. Of course, even if directed to
make use of inline encryption, fscrypt will only be able to do so if
either hardware inline encryption support is available for the
selected encryption algorithm or CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK
is selected. If neither is the case, fscrypt will fall back to using
the above mentioned read/write path changes for en/decryption.
Filename hashing and encoding Filename hashing and encoding
----------------------------- -----------------------------

View File

@@ -0,0 +1,85 @@
.. SPDX-License-Identifier: GPL-2.0
=================================================
incfs: A stacked incremental filesystem for Linux
=================================================
/sys/fs interface
=================
Please update Documentation/ABI/testing/sysfs-fs-incfs if you update this
section.
incfs creates the following files in /sys/fs.
Features
--------
/sys/fs/incremental-fs/features/corefs
Reads 'supported'. Always present.
/sys/fs/incremental-fs/features/v2
Reads 'supported'. Present if all v2 features of incfs are supported. These
are:
fs-verity support
inotify support
ioctls:
INCFS_IOC_SET_READ_TIMEOUTS
INCFS_IOC_GET_READ_TIMEOUTS
INCFS_IOC_GET_BLOCK_COUNT
INCFS_IOC_CREATE_MAPPED_FILE
.incomplete folder
.blocks_written pseudo file
report_uid mount option
/sys/fs/incremental-fs/features/zstd
Reads 'supported'. Present if zstd compression is supported for data blocks.
/sys/fs/incremental-fs/features/bugfix_throttling
Reads 'supported'. Present if the throttling lock bug is fixed.
Optional per mount
------------------
For each incfs mount, the mount option sysfs_name=[name] creates a /sys/fs
node called:
/sys/fs/incremental-fs/instances/[name]
This will contain the following files:
/sys/fs/incremental-fs/instances/[name]/reads_delayed_min
Returns a count of the number of reads that were delayed as a result of the
per UID read timeouts min time setting.
/sys/fs/incremental-fs/instances/[name]/reads_delayed_min_us
Returns total delay time for all files since first mount as a result of the
per UID read timeouts min time setting.
/sys/fs/incremental-fs/instances/[name]/reads_delayed_pending
Returns a count of the number of reads that were delayed as a result of
waiting for a pending read.
/sys/fs/incremental-fs/instances/[name]/reads_delayed_pending_us
Returns total delay time for all files since first mount as a result of
waiting for a pending read.
/sys/fs/incremental-fs/instances/[name]/reads_failed_hash_verification
Returns number of reads that failed because of hash verification failures.
/sys/fs/incremental-fs/instances/[name]/reads_failed_other
Returns number of reads that failed for reasons other than timing out or
hash failures.
/sys/fs/incremental-fs/instances/[name]/reads_failed_timed_out
Returns number of reads that timed out.
For reads_delayed_*** settings, note that a file can count for both
reads_delayed_min and reads_delayed_pending if incfs first waits for a pending
read then has to wait further for the min time. In that case, the time spent
waiting is split between reads_delayed_pending_us, which is increased by the
time spent waiting for the pending read, and reads_delayed_min_us, which is
increased by the remainder of the time spent waiting.
Reads that timed out are not added to the reads_delayed_pending or the
reads_delayed_pending_us counters.

View File

@@ -195,7 +195,7 @@ handle it in two different ways:
1. return EXDEV error: this error is returned by rename(2) when trying to 1. return EXDEV error: this error is returned by rename(2) when trying to
move a file or directory across filesystem boundaries. Hence move a file or directory across filesystem boundaries. Hence
applications are usually prepared to hande this error (mv(1) for example applications are usually prepared to handle this error (mv(1) for example
recursively copies the directory tree). This is the default behavior. recursively copies the directory tree). This is the default behavior.
2. If the "redirect_dir" feature is enabled, then the directory will be 2. If the "redirect_dir" feature is enabled, then the directory will be
@@ -324,6 +324,30 @@ and
The resulting access permissions should be the same. The difference is in The resulting access permissions should be the same. The difference is in
the time of copy (on-demand vs. up-front). the time of copy (on-demand vs. up-front).
Non-overlapping credentials
---------------------------
As noted above, all access to the upper, lower and work directories uses the
recorded mounter's MAC and DAC credentials. The incoming accesses are
checked against the caller's credentials.
In the case where caller MAC or DAC credentials do not overlap the mounter, a
use case available in older versions of the driver, the override_creds mount
flag can be turned off. This is for use patterns where the caller has
legitimate credentials that the mounter does not. For example init may have been the
mounter, but the caller would have execute or read MAC permissions where
init would not. override_creds off means all access, incoming, upper, lower
or working, will be tested against the caller.
Several unintended side effects will occur though. The caller without certain
key capabilities or lower privilege will not always be able to delete files or
directories, create nodes, or search some restricted directories. The ability
to search and read a directory entry is spotty as a result of the cache
mechanism not re-testing the credentials because of the assumption, a
privileged caller can fill cache, then a lower privilege can read the directory
cache. The uneven security model where cache, upperdir and workdir are opened
at privilege, but accessed without creating a form of privilege escalation,
should only be used with strict understanding of the side effects and of the
security policies.
Multiple lower layers Multiple lower layers
--------------------- ---------------------

View File

@@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data).
The "pathname" shows the name associated file for this mapping. If the mapping The "pathname" shows the name associated file for this mapping. If the mapping
is not associated with a file: is not associated with a file:
======= ==================================== ============= ====================================
[heap] the heap of the program [heap] the heap of the program
[stack] the stack of the main process [stack] the stack of the main process
[vdso] the "virtual dynamic shared object", [vdso] the "virtual dynamic shared object",
the kernel system call handler the kernel system call handler
======= ==================================== [anon:<name>] an anonymous mapping that has been
named by userspace
============= ====================================
or if empty, the mapping is anonymous. or if empty, the mapping is anonymous.
@@ -971,6 +973,7 @@ You may not have all of these fields.
SReclaimable: 159856 kB SReclaimable: 159856 kB
SUnreclaim: 124508 kB SUnreclaim: 124508 kB
PageTables: 24448 kB PageTables: 24448 kB
SecPageTables: 0 kB
NFS_Unstable: 0 kB NFS_Unstable: 0 kB
Bounce: 0 kB Bounce: 0 kB
WritebackTmp: 0 kB WritebackTmp: 0 kB
@@ -1065,6 +1068,9 @@ SUnreclaim
PageTables PageTables
amount of memory dedicated to the lowest level of page amount of memory dedicated to the lowest level of page
tables. tables.
SecPageTables
Memory consumed by secondary page tables; this currently
includes KVM mmu allocations on x86 and arm64.
NFS_Unstable NFS_Unstable
Always zero. Previous counted pages which had been written to Always zero. Previous counted pages which had been written to
the server, but has not been committed to stable storage. the server, but has not been committed to stable storage.

View File

@@ -77,6 +77,17 @@ HOSTLDLIBS
---------- ----------
Additional libraries to link against when building host programs. Additional libraries to link against when building host programs.
.. _userkbuildflags:
USERCFLAGS
----------
Additional options used for $(CC) when compiling userprogs.
USERLDFLAGS
-----------
Additional options used for $(LD) when linking userprogs. userprogs are linked
with CC, so $(USERLDFLAGS) should include "-Wl," prefix as applicable.
KBUILD_KCONFIG KBUILD_KCONFIG
-------------- --------------
Set the top-level Kconfig file to the value of this environment Set the top-level Kconfig file to the value of this environment

View File

@@ -982,6 +982,8 @@ The syntax is quite similar. The difference is to use "userprogs" instead of
When linking bpfilter_umh, it will be passed the extra option -static. When linking bpfilter_umh, it will be passed the extra option -static.
From command line, :ref:`USERCFLAGS and USERLDFLAGS <userkbuildflags>` will also be used.
5.4 When userspace programs are actually built 5.4 When userspace programs are actually built
---------------------------------------------- ----------------------------------------------

View File

@@ -21,6 +21,7 @@ This document describes how to build an out-of-tree kernel module.
--- 4.1 Kernel Includes --- 4.1 Kernel Includes
--- 4.2 Single Subdirectory --- 4.2 Single Subdirectory
--- 4.3 Several Subdirectories --- 4.3 Several Subdirectories
--- 4.4 UAPI Headers Installation
=== 5. Module Installation === 5. Module Installation
--- 5.1 INSTALL_MOD_PATH --- 5.1 INSTALL_MOD_PATH
--- 5.2 INSTALL_MOD_DIR --- 5.2 INSTALL_MOD_DIR
@@ -131,6 +132,10 @@ executed to make module versioning work.
/lib/modules/<kernel_release>/extra/, but a prefix may /lib/modules/<kernel_release>/extra/, but a prefix may
be added with INSTALL_MOD_PATH (discussed in section 5). be added with INSTALL_MOD_PATH (discussed in section 5).
headers_install
Export headers in a format suitable for userspace. The default
location is $PWD/usr. INSTALL_HDR_PATH can change this path.
clean clean
Remove all generated files in the module directory only. Remove all generated files in the module directory only.
@@ -406,6 +411,17 @@ according to the following rule:
pointing to the directory where the currently executing kbuild pointing to the directory where the currently executing kbuild
file is located. file is located.
4.4 UAPI Headers Installation
-----------------------------
External modules may export headers to userspace in a similar
fashion to the in-tree counterpart drivers. kbuild supports
running the headers_install target for out-of-tree modules. The location
where kbuild searches for headers is $(M)/include/uapi and
$(M)/arch/$(SRCARCH)/include/uapi.
See also Documentation/kbuild/headers_install.rst.
5. Module Installation 5. Module Installation
====================== ======================

View File

@@ -1133,6 +1133,19 @@ ip_local_reserved_ports - list of comma separated ranges
Default: Empty Default: Empty
ip_local_unbindable_ports - list of comma separated ranges
Specify the ports which are not directly bind()able.
Usually you would use this to block the use of ports which
are invalid due to something outside of the control of the
kernel. For example a port stolen by the nic for serial
console, remote power management or debugging.
There's a relatively high chance you will also want to list
these ports in 'ip_local_reserved_ports' to prevent autobinding.
Default: Empty
ip_unprivileged_port_start - INTEGER ip_unprivileged_port_start - INTEGER
This is a per-namespace sysctl. It defines the first This is a per-namespace sysctl. It defines the first
unprivileged port in the network namespace. Privileged ports unprivileged port in the network namespace. Privileged ports

View File

@@ -402,7 +402,7 @@ Consequently, the only sane governor to use together with EAS is schedutil,
because it is the only one providing some degree of consistency between because it is the only one providing some degree of consistency between
frequency requests and energy predictions. frequency requests and energy predictions.
Using EAS with any other governor than schedutil is not supported. Using EAS with any other governor than schedutil is not recommended.
6.5 Scale-invariant utilization signals 6.5 Scale-invariant utilization signals

View File

@@ -93,16 +93,19 @@ function
1. invokes optional hostt->eh_timed_out() callback. Return value can 1. invokes optional hostt->eh_timed_out() callback. Return value can
be one of be one of
- BLK_EH_RESET_TIMER - SCSI_EH_RESET_TIMER
This indicates that more time is required to finish the This indicates that more time is required to finish the
command. Timer is restarted. This action is counted as a command. Timer is restarted. This action is counted as a
retry and only allowed scmd->allowed + 1(!) times. Once the retry and only allowed scmd->allowed + 1(!) times. Once the
limit is reached, action for BLK_EH_DONE is taken instead. limit is reached, action for BLK_EH_DONE is taken instead.
- BLK_EH_DONE - SCSI_EH_NOT_HANDLED
eh_timed_out() callback did not handle the command. eh_timed_out() callback did not handle the command.
Step #2 is taken. Step #2 is taken.
- SCSI_EH_DONE
eh_timed_out() completed the command.
2. scsi_abort_command() is invoked to schedule an asynchronous abort. 2. scsi_abort_command() is invoked to schedule an asynchronous abort.
Asynchronous aborts are not invoked for commands for which the Asynchronous aborts are not invoked for commands for which the
SCSI_EH_ABORT_SCHEDULED flag is set (this indicates that the command SCSI_EH_ABORT_SCHEDULED flag is set (this indicates that the command

View File

@@ -17,6 +17,8 @@ Universal Flash Storage
3.2 UTP Transfer requests 3.2 UTP Transfer requests
3.3 UFS error handling 3.3 UFS error handling
3.4 SCSI Error handling 3.4 SCSI Error handling
4. BSG Support
5. UFS Reference Clock Frequency configuration
1. Overview 1. Overview
@@ -193,3 +195,16 @@ UFS Specifications can be found at:
- UFS - http://www.jedec.org/sites/default/files/docs/JESD220.pdf - UFS - http://www.jedec.org/sites/default/files/docs/JESD220.pdf
- UFSHCI - http://www.jedec.org/sites/default/files/docs/JESD223.pdf - UFSHCI - http://www.jedec.org/sites/default/files/docs/JESD223.pdf
5. UFS Reference Clock Frequency configuration
==============================================
Devicetree can define a clock named "ref_clk" under the UFS controller node
to specify the intended reference clock frequency for the UFS storage
parts. ACPI-based systems can specify the frequency using ACPI
Device-Specific Data property named "ref-clk-freq". In both ways the value
is interpreted as frequency in Hz and must match one of the values given in
the UFS specification. UFS subsystem will attempt to read the value when
executing common controller initialization. If the value is available, UFS
subsystem will ensure the bRefClkFreq attribute of the UFS storage device is
set accordingly and will modify it if there is a mismatch.

View File

@@ -100,6 +100,15 @@ amidi_map
MIDI device number maps assigned to the 2nd OSS device; MIDI device number maps assigned to the 2nd OSS device;
Default: 1 Default: 1
Module snd-soc-core
-------------------
The soc core module. It is used by all ALSA card drivers.
It takes the following options which have global effects.
prealloc_buffer_size_kbytes
Specify prealloc buffer size in kbytes (default: 512).
Common parameters for top sound card modules Common parameters for top sound card modules
-------------------------------------------- --------------------------------------------

View File

@@ -1763,6 +1763,21 @@ using the same key and variable from yet another event::
# echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger
Expressions support the use of addition, subtraction, multiplication and
division operators (+-\*/).
Note that if division by zero cannot be detected at parse time (i.e. the
divisor is not a constant), the result will be -1.
Numeric constants can also be used directly in an expression::
# echo 'hist:keys=next_pid:timestamp_secs=common_timestamp/1000000 ...' >> event/trigger
or assigned to a variable and referenced in a subsequent expression::
# echo 'hist:keys=next_pid:us_per_sec=1000000 ...' >> event/trigger
# echo 'hist:keys=next_pid:timestamp_secs=common_timestamp/$us_per_sec ...' >> event/trigger
2.2.2 Synthetic Events 2.2.2 Synthetic Events
---------------------- ----------------------

View File

@@ -784,6 +784,7 @@ The uvc function provides these attributes in its function directory:
streaming_maxpacket maximum packet size this endpoint is capable of streaming_maxpacket maximum packet size this endpoint is capable of
sending or receiving when this configuration is sending or receiving when this configuration is
selected selected
function_name name of the interface
=================== ================================================ =================== ================================================
There are also "control" and "streaming" subdirectories, each of which contain There are also "control" and "streaming" subdirectories, each of which contain

View File

@@ -242,8 +242,17 @@ Control IDs
* - ``V4L2_COLORFX_SET_CBCR`` * - ``V4L2_COLORFX_SET_CBCR``
- The Cb and Cr chroma components are replaced by fixed coefficients - The Cb and Cr chroma components are replaced by fixed coefficients
determined by ``V4L2_CID_COLORFX_CBCR`` control. determined by ``V4L2_CID_COLORFX_CBCR`` control.
* - ``V4L2_COLORFX_SET_RGB``
- The RGB components are replaced by the fixed RGB components determined
by ``V4L2_CID_COLORFX_RGB`` control.
``V4L2_CID_COLORFX_RGB`` ``(integer)``
Determines the Red, Green, and Blue coefficients for
``V4L2_COLORFX_SET_RGB`` color effect.
Bits [7:0] of the supplied 32 bit value are interpreted as Blue component,
bits [15:8] as Green component, bits [23:16] as Red component, and
bits [31:24] must be zero.
``V4L2_CID_COLORFX_CBCR`` ``(integer)`` ``V4L2_CID_COLORFX_CBCR`` ``(integer)``
Determines the Cb and Cr coefficients for ``V4L2_COLORFX_SET_CBCR`` Determines the Cb and Cr coefficients for ``V4L2_COLORFX_SET_CBCR``

View File

@@ -414,7 +414,7 @@ kvm_run' (see below).
----------------- -----------------
:Capability: basic :Capability: basic
:Architectures: all except ARM, arm64 :Architectures: all except arm64
:Type: vcpu ioctl :Type: vcpu ioctl
:Parameters: struct kvm_regs (out) :Parameters: struct kvm_regs (out)
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -447,7 +447,7 @@ Reads the general purpose registers from the vcpu.
----------------- -----------------
:Capability: basic :Capability: basic
:Architectures: all except ARM, arm64 :Architectures: all except arm64
:Type: vcpu ioctl :Type: vcpu ioctl
:Parameters: struct kvm_regs (in) :Parameters: struct kvm_regs (in)
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -804,7 +804,7 @@ Writes the floating point state to the vcpu.
----------------------- -----------------------
:Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390) :Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390)
:Architectures: x86, ARM, arm64, s390 :Architectures: x86, arm64, s390
:Type: vm ioctl :Type: vm ioctl
:Parameters: none :Parameters: none
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -813,7 +813,7 @@ Creates an interrupt controller model in the kernel.
On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up
future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both
PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. PIC and IOAPIC; GSI 16-23 only go to the IOAPIC.
On ARM/arm64, a GICv2 is created. Any other GIC versions require the usage of On arm64, a GICv2 is created. Any other GIC versions require the usage of
KVM_CREATE_DEVICE, which also supports creating a GICv2. Using KVM_CREATE_DEVICE, which also supports creating a GICv2. Using
KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2. KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2.
On s390, a dummy irq routing table is created. On s390, a dummy irq routing table is created.
@@ -826,7 +826,7 @@ before KVM_CREATE_IRQCHIP can be used.
----------------- -----------------
:Capability: KVM_CAP_IRQCHIP :Capability: KVM_CAP_IRQCHIP
:Architectures: x86, arm, arm64 :Architectures: x86, arm64
:Type: vm ioctl :Type: vm ioctl
:Parameters: struct kvm_irq_level :Parameters: struct kvm_irq_level
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -850,7 +850,7 @@ capability is present (or unless it is not using the in-kernel irqchip,
of course). of course).
ARM/arm64 can signal an interrupt either at the CPU level, or at the arm64 can signal an interrupt either at the CPU level, or at the
in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to
use PPIs designated for specific cpus. The irq field is interpreted use PPIs designated for specific cpus. The irq field is interpreted
like this:: like this::
@@ -876,7 +876,7 @@ When KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 is supported, the target vcpu is
identified as (256 * vcpu2_index + vcpu_index). Otherwise, vcpu2_index identified as (256 * vcpu2_index + vcpu_index). Otherwise, vcpu2_index
must be zero. must be zero.
Note that on arm/arm64, the KVM_CAP_IRQCHIP capability only conditions Note that on arm64, the KVM_CAP_IRQCHIP capability only conditions
injection of interrupts for the in-kernel irqchip. KVM_IRQ_LINE can always injection of interrupts for the in-kernel irqchip. KVM_IRQ_LINE can always
be used for a userspace interrupt controller. be used for a userspace interrupt controller.
@@ -1037,7 +1037,7 @@ such as migration.
:Capability: KVM_CAP_VCPU_EVENTS :Capability: KVM_CAP_VCPU_EVENTS
:Extended by: KVM_CAP_INTR_SHADOW :Extended by: KVM_CAP_INTR_SHADOW
:Architectures: x86, arm, arm64 :Architectures: x86, arm64
:Type: vcpu ioctl :Type: vcpu ioctl
:Parameters: struct kvm_vcpu_event (out) :Parameters: struct kvm_vcpu_event (out)
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -1096,8 +1096,8 @@ The following bits are defined in the flags field:
fields contain a valid state. This bit will be set whenever fields contain a valid state. This bit will be set whenever
KVM_CAP_EXCEPTION_PAYLOAD is enabled. KVM_CAP_EXCEPTION_PAYLOAD is enabled.
ARM/ARM64: ARM64:
^^^^^^^^^^ ^^^^^^
If the guest accesses a device that is being emulated by the host kernel in If the guest accesses a device that is being emulated by the host kernel in
such a way that a real device would generate a physical SError, KVM may make such a way that a real device would generate a physical SError, KVM may make
@@ -1156,7 +1156,7 @@ directly to the virtual CPU).
:Capability: KVM_CAP_VCPU_EVENTS :Capability: KVM_CAP_VCPU_EVENTS
:Extended by: KVM_CAP_INTR_SHADOW :Extended by: KVM_CAP_INTR_SHADOW
:Architectures: x86, arm, arm64 :Architectures: x86, arm64
:Type: vcpu ioctl :Type: vcpu ioctl
:Parameters: struct kvm_vcpu_event (in) :Parameters: struct kvm_vcpu_event (in)
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -1191,8 +1191,8 @@ can be set in the flags field to signal that the
exception_has_payload, exception_payload, and exception.pending fields exception_has_payload, exception_payload, and exception.pending fields
contain a valid state and shall be written into the VCPU. contain a valid state and shall be written into the VCPU.
ARM/ARM64: ARM64:
^^^^^^^^^^ ^^^^^^
User space may need to inject several types of events to the guest. User space may need to inject several types of events to the guest.
@@ -1399,7 +1399,7 @@ for vm-wide capabilities.
--------------------- ---------------------
:Capability: KVM_CAP_MP_STATE :Capability: KVM_CAP_MP_STATE
:Architectures: x86, s390, arm, arm64 :Architectures: x86, s390, arm64
:Type: vcpu ioctl :Type: vcpu ioctl
:Parameters: struct kvm_mp_state (out) :Parameters: struct kvm_mp_state (out)
:Returns: 0 on success; -1 on error :Returns: 0 on success; -1 on error
@@ -1416,7 +1416,7 @@ uniprocessor guests).
Possible values are: Possible values are:
========================== =============================================== ========================== ===============================================
KVM_MP_STATE_RUNNABLE the vcpu is currently running [x86,arm/arm64] KVM_MP_STATE_RUNNABLE the vcpu is currently running [x86,arm64]
KVM_MP_STATE_UNINITIALIZED the vcpu is an application processor (AP) KVM_MP_STATE_UNINITIALIZED the vcpu is an application processor (AP)
which has not yet received an INIT signal [x86] which has not yet received an INIT signal [x86]
KVM_MP_STATE_INIT_RECEIVED the vcpu has received an INIT signal, and is KVM_MP_STATE_INIT_RECEIVED the vcpu has received an INIT signal, and is
@@ -1425,29 +1425,52 @@ Possible values are:
is waiting for an interrupt [x86] is waiting for an interrupt [x86]
KVM_MP_STATE_SIPI_RECEIVED the vcpu has just received a SIPI (vector KVM_MP_STATE_SIPI_RECEIVED the vcpu has just received a SIPI (vector
accessible via KVM_GET_VCPU_EVENTS) [x86] accessible via KVM_GET_VCPU_EVENTS) [x86]
KVM_MP_STATE_STOPPED the vcpu is stopped [s390,arm/arm64] KVM_MP_STATE_STOPPED the vcpu is stopped [s390,arm64]
KVM_MP_STATE_CHECK_STOP the vcpu is in a special error state [s390] KVM_MP_STATE_CHECK_STOP the vcpu is in a special error state [s390]
KVM_MP_STATE_OPERATING the vcpu is operating (running or halted) KVM_MP_STATE_OPERATING the vcpu is operating (running or halted)
[s390] [s390]
KVM_MP_STATE_LOAD the vcpu is in a special load/startup state KVM_MP_STATE_LOAD the vcpu is in a special load/startup state
[s390] [s390]
KVM_MP_STATE_SUSPENDED the vcpu is in a suspend state and is waiting
for a wakeup event [arm64]
========================== =============================================== ========================== ===============================================
On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
in-kernel irqchip, the multiprocessing state must be maintained by userspace on in-kernel irqchip, the multiprocessing state must be maintained by userspace on
these architectures. these architectures.
For arm/arm64: For arm64:
^^^^^^^^^^^^^^ ^^^^^^^^^^
The only states that are valid are KVM_MP_STATE_STOPPED and If a vCPU is in the KVM_MP_STATE_SUSPENDED state, KVM will emulate the
KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not. architectural execution of a WFI instruction.
If a wakeup event is recognized, KVM will exit to userspace with a
KVM_SYSTEM_EVENT exit, where the event type is KVM_SYSTEM_EVENT_WAKEUP. If
userspace wants to honor the wakeup, it must set the vCPU's MP state to
KVM_MP_STATE_RUNNABLE. If it does not, KVM will continue to await a wakeup
event in subsequent calls to KVM_RUN.
.. warning::
If userspace intends to keep the vCPU in a SUSPENDED state, it is
strongly recommended that userspace take action to suppress the
wakeup event (such as masking an interrupt). Otherwise, subsequent
calls to KVM_RUN will immediately exit with a KVM_SYSTEM_EVENT_WAKEUP
event and inadvertently waste CPU cycles.
Additionally, if userspace takes action to suppress a wakeup event,
it is strongly recommended that it also restores the vCPU to its
original state when the vCPU is made RUNNABLE again. For example,
if userspace masked a pending interrupt to suppress the wakeup,
the interrupt should be unmasked before returning control to the
guest.
4.39 KVM_SET_MP_STATE 4.39 KVM_SET_MP_STATE
--------------------- ---------------------
:Capability: KVM_CAP_MP_STATE :Capability: KVM_CAP_MP_STATE
:Architectures: x86, s390, arm, arm64 :Architectures: x86, s390, arm64
:Type: vcpu ioctl :Type: vcpu ioctl
:Parameters: struct kvm_mp_state (in) :Parameters: struct kvm_mp_state (in)
:Returns: 0 on success; -1 on error :Returns: 0 on success; -1 on error
@@ -1459,8 +1482,8 @@ On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
in-kernel irqchip, the multiprocessing state must be maintained by userspace on in-kernel irqchip, the multiprocessing state must be maintained by userspace on
these architectures. these architectures.
For arm/arm64: For arm64:
^^^^^^^^^^^^^^ ^^^^^^^^^^
The only states that are valid are KVM_MP_STATE_STOPPED and The only states that are valid are KVM_MP_STATE_STOPPED and
KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not. KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not.
@@ -1715,14 +1738,14 @@ The flags bitmap is defined as::
------------------------ ------------------------
:Capability: KVM_CAP_IRQ_ROUTING :Capability: KVM_CAP_IRQ_ROUTING
:Architectures: x86 s390 arm arm64 :Architectures: x86 s390 arm64
:Type: vm ioctl :Type: vm ioctl
:Parameters: struct kvm_irq_routing (in) :Parameters: struct kvm_irq_routing (in)
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
Sets the GSI routing table entries, overwriting any previously set entries. Sets the GSI routing table entries, overwriting any previously set entries.
On arm/arm64, GSI routing has the following limitation: On arm64, GSI routing has the following limitation:
- GSI routing does not apply to KVM_IRQ_LINE but only to KVM_IRQFD. - GSI routing does not apply to KVM_IRQ_LINE but only to KVM_IRQFD.
@@ -2526,6 +2549,24 @@ EINVAL.
After the vcpu's SVE configuration is finalized, further attempts to After the vcpu's SVE configuration is finalized, further attempts to
write this register will fail with EPERM. write this register will fail with EPERM.
arm64 bitmap feature firmware pseudo-registers have the following bit pattern::
0x6030 0000 0016 <regno:16>
The bitmap feature firmware registers expose the hypercall services that
are available for userspace to configure. The set bits correspond to the
services that are available for the guests to access. By default, KVM
sets all the supported bits during VM initialization. The userspace can
discover the available services via KVM_GET_ONE_REG, and write back the
bitmap corresponding to the features that it wishes guests to see via
KVM_SET_ONE_REG.
Note: These registers are immutable once any of the vCPUs of the VM has
run at least once. A KVM_SET_ONE_REG in such a scenario will return
a -EBUSY to userspace.
(See Documentation/virt/kvm/arm/hypercalls.rst for more details.)
MIPS registers are mapped using the lower 32 bits. The upper 16 of that is MIPS registers are mapped using the lower 32 bits. The upper 16 of that is
the register group type: the register group type:
@@ -2636,7 +2677,7 @@ after pausing the vcpu, but before it is resumed.
------------------- -------------------
:Capability: KVM_CAP_SIGNAL_MSI :Capability: KVM_CAP_SIGNAL_MSI
:Architectures: x86 arm arm64 :Architectures: x86 arm64
:Type: vm ioctl :Type: vm ioctl
:Parameters: struct kvm_msi (in) :Parameters: struct kvm_msi (in)
:Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error :Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
@@ -2824,7 +2865,7 @@ into the hash PTE second double word).
-------------- --------------
:Capability: KVM_CAP_IRQFD :Capability: KVM_CAP_IRQFD
:Architectures: x86 s390 arm arm64 :Architectures: x86 s390 arm64
:Type: vm ioctl :Type: vm ioctl
:Parameters: struct kvm_irqfd (in) :Parameters: struct kvm_irqfd (in)
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -2850,7 +2891,7 @@ Note that closing the resamplefd is not sufficient to disable the
irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
On arm/arm64, gsi routing being supported, the following can happen: On arm64, gsi routing being supported, the following can happen:
- in case no routing entry is associated to this gsi, injection fails - in case no routing entry is associated to this gsi, injection fails
- in case the gsi is associated to an irqchip routing entry, - in case the gsi is associated to an irqchip routing entry,
@@ -3104,7 +3145,7 @@ current state. "addr" is ignored.
---------------------- ----------------------
:Capability: basic :Capability: basic
:Architectures: arm, arm64 :Architectures: arm64
:Type: vcpu ioctl :Type: vcpu ioctl
:Parameters: struct kvm_vcpu_init (in) :Parameters: struct kvm_vcpu_init (in)
:Returns: 0 on success; -1 on error :Returns: 0 on success; -1 on error
@@ -3202,7 +3243,7 @@ Possible features:
----------------------------- -----------------------------
:Capability: basic :Capability: basic
:Architectures: arm, arm64 :Architectures: arm64
:Type: vm ioctl :Type: vm ioctl
:Parameters: struct kvm_vcpu_init (out) :Parameters: struct kvm_vcpu_init (out)
:Returns: 0 on success; -1 on error :Returns: 0 on success; -1 on error
@@ -3231,7 +3272,7 @@ VCPU matching underlying host.
--------------------- ---------------------
:Capability: basic :Capability: basic
:Architectures: arm, arm64, mips :Architectures: arm64, mips
:Type: vcpu ioctl :Type: vcpu ioctl
:Parameters: struct kvm_reg_list (in/out) :Parameters: struct kvm_reg_list (in/out)
:Returns: 0 on success; -1 on error :Returns: 0 on success; -1 on error
@@ -3258,7 +3299,7 @@ KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
----------------------------------------- -----------------------------------------
:Capability: KVM_CAP_ARM_SET_DEVICE_ADDR :Capability: KVM_CAP_ARM_SET_DEVICE_ADDR
:Architectures: arm, arm64 :Architectures: arm64
:Type: vm ioctl :Type: vm ioctl
:Parameters: struct kvm_arm_device_address (in) :Parameters: struct kvm_arm_device_address (in)
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -3285,13 +3326,13 @@ can access emulated or directly exposed devices, which the host kernel needs
to know about. The id field is an architecture specific identifier for a to know about. The id field is an architecture specific identifier for a
specific device. specific device.
ARM/arm64 divides the id field into two parts, a device id and an arm64 divides the id field into two parts, a device id and an
address type id specific to the individual device:: address type id specific to the individual device::
bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 | bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 |
field: | 0x00000000 | device id | addr type id | field: | 0x00000000 | device id | addr type id |
ARM/arm64 currently only require this when using the in-kernel GIC arm64 currently only requires this when using the in-kernel GIC
support for the hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 support for the hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2
as the device id. When setting the base address for the guest's as the device id. When setting the base address for the guest's
mapping of the VGIC virtual CPU and distributor interface, the ioctl mapping of the VGIC virtual CPU and distributor interface, the ioctl
@@ -4505,7 +4546,7 @@ to I/O ports.
------------------------------------ ------------------------------------
:Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 :Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
:Architectures: x86, arm, arm64, mips :Architectures: x86, arm64, mips
:Type: vm ioctl :Type: vm ioctl
:Parameters: struct kvm_clear_dirty_log (in) :Parameters: struct kvm_clear_dirty_log (in)
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -4617,7 +4658,7 @@ version has the following quirks:
4.119 KVM_ARM_VCPU_FINALIZE 4.119 KVM_ARM_VCPU_FINALIZE
--------------------------- ---------------------------
:Architectures: arm, arm64 :Architectures: arm64
:Type: vcpu ioctl :Type: vcpu ioctl
:Parameters: int feature (in) :Parameters: int feature (in)
:Returns: 0 on success, -1 on error :Returns: 0 on success, -1 on error
@@ -5656,13 +5697,15 @@ should put the acknowledged interrupt vector into the 'epr' field.
#define KVM_SYSTEM_EVENT_SHUTDOWN 1 #define KVM_SYSTEM_EVENT_SHUTDOWN 1
#define KVM_SYSTEM_EVENT_RESET 2 #define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3 #define KVM_SYSTEM_EVENT_CRASH 3
#define KVM_SYSTEM_EVENT_WAKEUP 4
#define KVM_SYSTEM_EVENT_SUSPEND 5
__u32 type; __u32 type;
__u64 flags; __u64 flags;
} system_event; } system_event;
If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
a system-level event using some architecture specific mechanism (hypercall a system-level event using some architecture specific mechanism (hypercall
or some special instruction). In case of ARM/ARM64, this is triggered using or some special instruction). In case of ARM64, this is triggered using
HVC instruction based PSCI call from the vcpu. The 'type' field describes HVC instruction based PSCI call from the vcpu. The 'type' field describes
the system-level event type. The 'flags' field describes architecture the system-level event type. The 'flags' field describes architecture
specific flags for the system-level event. specific flags for the system-level event.
@@ -5680,6 +5723,42 @@ Valid values for 'type' are:
has requested a crash condition maintenance. Userspace can choose has requested a crash condition maintenance. Userspace can choose
to ignore the request, or to gather VM memory core dump and/or to ignore the request, or to gather VM memory core dump and/or
reset/shutdown of the VM. reset/shutdown of the VM.
- KVM_SYSTEM_EVENT_WAKEUP -- the exiting vCPU is in a suspended state and
KVM has recognized a wakeup event. Userspace may honor this event by
marking the exiting vCPU as runnable, or deny it and call KVM_RUN again.
- KVM_SYSTEM_EVENT_SUSPEND -- the guest has requested a suspension of
the VM.
For arm64:
----------
KVM_SYSTEM_EVENT_SUSPEND exits are enabled with the
KVM_CAP_ARM_SYSTEM_SUSPEND VM capability. If a guest invokes the PSCI
SYSTEM_SUSPEND function, KVM will exit to userspace with this event
type.
It is the sole responsibility of userspace to implement the PSCI
SYSTEM_SUSPEND call according to ARM DEN0022D.b 5.19 "SYSTEM_SUSPEND".
KVM does not change the vCPU's state before exiting to userspace, so
the call parameters are left in-place in the vCPU registers.
Userspace is _required_ to take action for such an exit. It must
either:
- Honor the guest request to suspend the VM. Userspace can request
in-kernel emulation of suspension by setting the calling vCPU's
state to KVM_MP_STATE_SUSPENDED. Userspace must configure the vCPU's
state according to the parameters passed to the PSCI function when
the calling vCPU is resumed. See ARM DEN0022D.b 5.19.1 "Intended use"
for details on the function parameters.
- Deny the guest request to suspend the VM. See ARM DEN0022D.b 5.19.2
"Caller responsibilities" for possible return values.
Valid flags are:
- KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued
a SYSTEM_RESET2 call according to v1.1 of the PSCI specification.
:: ::
@@ -5755,7 +5834,7 @@ in send_page or recv a buffer to recv_page).
__u64 fault_ipa; __u64 fault_ipa;
} arm_nisv; } arm_nisv;
Used on arm and arm64 systems. If a guest accesses memory not in a memslot, Used on arm64 systems. If a guest accesses memory not in a memslot,
KVM will typically return to userspace and ask it to do MMIO emulation on its KVM will typically return to userspace and ask it to do MMIO emulation on its
behalf. However, for certain classes of instructions, no instruction decode behalf. However, for certain classes of instructions, no instruction decode
(direction, length of memory access) is provided, and fetching and decoding (direction, length of memory access) is provided, and fetching and decoding
@@ -5772,16 +5851,22 @@ did not fall within an I/O window.
Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and enable Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and enable
this capability at VM creation. Once this is done, these types of errors will this capability at VM creation. Once this is done, these types of errors will
instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits from instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits from
the HSR (arm) and ESR_EL2 (arm64) in the esr_iss field, and the faulting IPA the ESR_EL2 in the esr_iss field, and the faulting IPA in the fault_ipa field.
in the fault_ipa field. Userspace can either fix up the access if it's Userspace can either fix up the access if it's actually an I/O access by
actually an I/O access by decoding the instruction from guest memory (if it's decoding the instruction from guest memory (if it's very brave) and continue
very brave) and continue executing the guest, or it can decide to suspend, executing the guest, or it can decide to suspend, dump, or restart the guest.
dump, or restart the guest.
Note that KVM does not skip the faulting instruction as it does for Note that KVM does not skip the faulting instruction as it does for
KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state
if it decides to decode and emulate the instruction. if it decides to decode and emulate the instruction.
This feature isn't available to protected VMs, as userspace does not
have access to the state that is required to perform the emulation.
Instead, a data abort exception is directly injected in the guest.
Note that although KVM_CAP_ARM_NISV_TO_USER will be reported if
queried outside of a protected VM context, the feature will not be
exposed if queried on a protected VM file descriptor.
:: ::
/* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
@@ -6464,7 +6549,7 @@ and injected exceptions.
7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2
:Architectures: x86, arm, arm64, mips :Architectures: x86, arm64, mips
:Parameters: args[0] whether feature should be enabled or not :Parameters: args[0] whether feature should be enabled or not
Valid flags are:: Valid flags are::
@@ -6833,7 +6918,7 @@ reserved.
8.9 KVM_CAP_ARM_USER_IRQ 8.9 KVM_CAP_ARM_USER_IRQ
------------------------ ------------------------
:Architectures: arm, arm64 :Architectures: arm64
This capability, if KVM_CHECK_EXTENSION indicates that it is available, means This capability, if KVM_CHECK_EXTENSION indicates that it is available, means
that if userspace creates a VM without an in-kernel interrupt controller, it that if userspace creates a VM without an in-kernel interrupt controller, it
@@ -6960,7 +7045,7 @@ HvFlushVirtualAddressList, HvFlushVirtualAddressListEx.
8.19 KVM_CAP_ARM_INJECT_SERROR_ESR 8.19 KVM_CAP_ARM_INJECT_SERROR_ESR
---------------------------------- ----------------------------------
:Architectures: arm, arm64 :Architectures: arm64
This capability indicates that userspace can specify (via the This capability indicates that userspace can specify (via the
KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it
@@ -7266,6 +7351,16 @@ of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace
the hypercalls whose corresponding bit is in the argument, and return the hypercalls whose corresponding bit is in the argument, and return
ENOSYS for the others. ENOSYS for the others.
8.36 KVM_CAP_ARM_SYSTEM_SUSPEND
-------------------------------
:Capability: KVM_CAP_ARM_SYSTEM_SUSPEND
:Architectures: arm64
:Type: vm
When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of
type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request.
9. Known KVM API problems 9. Known KVM API problems
========================= =========================

View File

@@ -0,0 +1,138 @@
.. SPDX-License-Identifier: GPL-2.0
=======================================
ARM firmware pseudo-registers interface
=======================================
KVM handles the hypercall services as requested by the guests. New hypercall
services are regularly made available by the ARM specification or by KVM (as
vendor services) if they make sense from a virtualization point of view.
This means that a guest booted on two different versions of KVM can observe
two different "firmware" revisions. This could cause issues if a given guest
is tied to a particular version of a hypercall service, or if a migration
causes a different version to be exposed out of the blue to an unsuspecting
guest.
In order to remedy this situation, KVM exposes a set of "firmware
pseudo-registers" that can be manipulated using the GET/SET_ONE_REG
interface. These registers can be saved/restored by userspace, and set
to a convenient value as required.
The following registers are defined:
* KVM_REG_ARM_PSCI_VERSION:
KVM implements the PSCI (Power State Coordination Interface)
specification in order to provide services such as CPU on/off, reset
and power-off to the guest.
- Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set
(and thus has already been initialized)
- Returns the current PSCI version on GET_ONE_REG (defaulting to the
highest PSCI version implemented by KVM and compatible with v0.2)
- Allows any PSCI version implemented by KVM and compatible with
v0.2 to be set with SET_ONE_REG
- Affects the whole VM (even if the register view is per-vcpu)
* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
Holds the state of the firmware support to mitigate CVE-2017-5715, as
offered by KVM to the guest via a HVC call. The workaround is described
under SMCCC_ARCH_WORKAROUND_1 in [1]_.
Accepted values are:
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL:
KVM does not offer
firmware support for the workaround. The mitigation status for the
guest is unknown.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL:
The workaround HVC call is
available to the guest and required for the mitigation.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED:
The workaround HVC call
is available to the guest, but it is not needed on this VCPU.
* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
Holds the state of the firmware support to mitigate CVE-2018-3639, as
offered by KVM to the guest via a HVC call. The workaround is described
under SMCCC_ARCH_WORKAROUND_2 in [1]_.
Accepted values are:
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
A workaround is not
available. KVM does not offer firmware support for the workaround.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
The workaround state is
unknown. KVM does not offer firmware support for the workaround.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
The workaround is available,
and can be disabled by a vCPU. If
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for
this vCPU.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
The workaround is always active on this vCPU or it is not needed.
Bitmap Feature Firmware Registers
---------------------------------
Contrary to the above registers, the following registers expose the
hypercall services in the form of a feature-bitmap to the userspace. This
bitmap is translated to the services that are available to the guest.
There is a register defined per service call owner and can be accessed via
GET/SET_ONE_REG interface.
By default, these registers are set with the upper limit of the features
that are supported. This way userspace can discover all the usable
hypercall services via GET_ONE_REG. Userspace can then write the
desired bitmap back via SET_ONE_REG. The features for the registers that
are untouched, probably because userspace isn't aware of them, will be
exposed as is to the guest.
Note that KVM will not allow the userspace to configure the registers
anymore once any of the vCPUs has run at least once. Instead, it will
return a -EBUSY.
The pseudo-firmware bitmap registers are as follows:
* KVM_REG_ARM_STD_BMAP:
Controls the bitmap of the ARM Standard Secure Service Calls.
The following bits are accepted:
Bit-0: KVM_REG_ARM_STD_BIT_TRNG_V1_0:
The bit represents the services offered under v1.0 of ARM True Random
Number Generator (TRNG) specification, ARM DEN0098.
* KVM_REG_ARM_STD_HYP_BMAP:
Controls the bitmap of the ARM Standard Hypervisor Service Calls.
The following bits are accepted:
Bit-0: KVM_REG_ARM_STD_HYP_BIT_PV_TIME:
The bit represents the Paravirtualized Time service as represented by
ARM DEN0057A.
* KVM_REG_ARM_VENDOR_HYP_BMAP:
Controls the bitmap of the Vendor specific Hypervisor Service Calls.
The following bits are accepted:
Bit-0: KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT:
The bit represents the ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID
and ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID function-ids.
Bit-1: KVM_REG_ARM_VENDOR_HYP_BIT_PTP:
The bit represents the Precision Time Protocol KVM service.
Errors:
======= =============================================================
-ENOENT Unknown register accessed.
-EBUSY Attempt to write to the register after the VM has started.
-EINVAL Invalid bitmap written to the register.
======= =============================================================
.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf

View File

@@ -0,0 +1,152 @@
.. SPDX-License-Identifier: GPL-2.0
===============================================
KVM/arm64-specific hypercalls exposed to guests
===============================================
This file documents the KVM/arm64-specific hypercalls which may be
exposed by KVM/arm64 to guest operating systems. These hypercalls are
issued using the HVC instruction according to version 1.1 of the Arm SMC
Calling Convention (DEN0028/C):
https://developer.arm.com/docs/den0028/c
All KVM/arm64-specific hypercalls are allocated within the "Vendor
Specific Hypervisor Service Call" range with a UID of
``28b46fb6-2ec5-11e9-a9ca-4b564d003a74``. This UID should be queried by the
guest using the standard "Call UID" function for the service range in
order to determine that the KVM/arm64-specific hypercalls are available.
``ARM_SMCCC_KVM_FUNC_FEATURES``
---------------------------------------------
Provides a discovery mechanism for other KVM/arm64 hypercalls.
+---------------------+-------------------------------------------------------------+
| Presence: | Mandatory for the KVM/arm64 UID |
+---------------------+-------------------------------------------------------------+
| Calling convention: | HVC32 |
+---------------------+----------+--------------------------------------------------+
| Function ID: | (uint32) | 0x86000000 |
+---------------------+----------+--------------------------------------------------+
| Arguments: | None |
+---------------------+----------+----+---------------------------------------------+
| Return Values: | (uint32) | R0 | Bitmap of available function numbers 0-31 |
| +----------+----+---------------------------------------------+
| | (uint32) | R1 | Bitmap of available function numbers 32-63 |
| +----------+----+---------------------------------------------+
| | (uint32) | R2 | Bitmap of available function numbers 64-95 |
| +----------+----+---------------------------------------------+
| | (uint32) | R3 | Bitmap of available function numbers 96-127 |
+---------------------+----------+----+---------------------------------------------+
``ARM_SMCCC_KVM_FUNC_PTP``
----------------------------------------
See ptp_kvm.rst
``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``
----------------------------------
Query the memory protection parameters for a protected virtual machine.
+---------------------+-------------------------------------------------------------+
| Presence: | Optional; protected guests only. |
+---------------------+-------------------------------------------------------------+
| Calling convention: | HVC64 |
+---------------------+----------+--------------------------------------------------+
| Function ID: | (uint32) | 0xC6000002 |
+---------------------+----------+----+---------------------------------------------+
| Arguments: | (uint64) | R1 | Reserved / Must be zero |
| +----------+----+---------------------------------------------+
| | (uint64) | R2 | Reserved / Must be zero |
| +----------+----+---------------------------------------------+
| | (uint64) | R3 | Reserved / Must be zero |
+---------------------+----------+----+---------------------------------------------+
| Return Values: | (int64) | R0 | ``INVALID_PARAMETER (-3)`` on error, else |
| | | | memory protection granule in bytes |
+---------------------+----------+----+---------------------------------------------+
``ARM_SMCCC_KVM_FUNC_MEM_SHARE``
--------------------------------
Share a region of memory with the KVM host, granting it read, write and execute
permissions. The size of the region is equal to the memory protection granule
advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``.
+---------------------+-------------------------------------------------------------+
| Presence: | Optional; protected guests only. |
+---------------------+-------------------------------------------------------------+
| Calling convention: | HVC64 |
+---------------------+----------+--------------------------------------------------+
| Function ID: | (uint32) | 0xC6000003 |
+---------------------+----------+----+---------------------------------------------+
| Arguments: | (uint64) | R1 | Base IPA of memory region to share |
| +----------+----+---------------------------------------------+
| | (uint64) | R2 | Reserved / Must be zero |
| +----------+----+---------------------------------------------+
| | (uint64) | R3 | Reserved / Must be zero |
+---------------------+----------+----+---------------------------------------------+
| Return Values: | (int64) | R0 | ``SUCCESS (0)`` |
| | | +---------------------------------------------+
| | | | ``INVALID_PARAMETER (-3)`` |
+---------------------+----------+----+---------------------------------------------+
``ARM_SMCCC_KVM_FUNC_MEM_UNSHARE``
----------------------------------
Revoke access permission from the KVM host to a memory region previously shared
with ``ARM_SMCCC_KVM_FUNC_MEM_SHARE``. The size of the region is equal to the
memory protection granule advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``.
+---------------------+-------------------------------------------------------------+
| Presence: | Optional; protected guests only. |
+---------------------+-------------------------------------------------------------+
| Calling convention: | HVC64 |
+---------------------+----------+--------------------------------------------------+
| Function ID: | (uint32) | 0xC6000004 |
+---------------------+----------+----+---------------------------------------------+
| Arguments: | (uint64) | R1 | Base IPA of memory region to unshare |
| +----------+----+---------------------------------------------+
| | (uint64) | R2 | Reserved / Must be zero |
| +----------+----+---------------------------------------------+
| | (uint64) | R3 | Reserved / Must be zero |
+---------------------+----------+----+---------------------------------------------+
| Return Values: | (int64) | R0 | ``SUCCESS (0)`` |
| | | +---------------------------------------------+
| | | | ``INVALID_PARAMETER (-3)`` |
+---------------------+----------+----+---------------------------------------------+
``ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH``
--------------------------------------
Cooperatively relinquish ownership of a memory region. The size of the
region is equal to the memory protection granule advertised by
``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. If this hypercall is advertised
then it is mandatory to call it before freeing memory via, for
example, virtio balloon. If the caller is a protected VM, it is
guaranteed that the memory region will be completely cleared before
becoming visible to another VM.
+---------------------+-------------------------------------------------------------+
| Presence: | Optional. |
+---------------------+-------------------------------------------------------------+
| Calling convention: | HVC64 |
+---------------------+----------+--------------------------------------------------+
| Function ID: | (uint32) | 0xC6000009 |
+---------------------+----------+----+---------------------------------------------+
| Arguments: | (uint64) | R1 | Base IPA of memory region to relinquish |
| +----------+----+---------------------------------------------+
| | (uint64) | R2 | Reserved / Must be zero |
| +----------+----+---------------------------------------------+
| | (uint64) | R3 | Reserved / Must be zero |
+---------------------+----------+----+---------------------------------------------+
| Return Values: | (int64) | R0 | ``SUCCESS (0)`` |
| | | +---------------------------------------------+
| | | | ``INVALID_PARAMETER (-3)`` |
+---------------------+----------+----+---------------------------------------------+
``ARM_SMCCC_KVM_FUNC_MMIO_GUARD_*``
-----------------------------------
See mmio-guard.rst

View File

@@ -7,7 +7,10 @@ ARM
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
fw-pseudo-registers
hyp-abi hyp-abi
psci hypercalls
pkvm
pvtime pvtime
ptp_kvm ptp_kvm
mmio-guard

View File

@@ -0,0 +1,74 @@
.. SPDX-License-Identifier: GPL-2.0
==============
KVM MMIO guard
==============
KVM implements device emulation by handling translation faults to any
IPA range that is not contained in a memory slot. Such a translation
fault is in most cases passed on to userspace (or in rare cases to the
host kernel) with the address, size and possibly data of the access
for emulation.
Should the guest exit with an address that is not one that corresponds
to an emulatable device, userspace may take measures that are not the
most graceful as far as the guest is concerned (such as terminating it
or delivering a fatal exception).
There is also an element of trust: by forwarding the request to
userspace, the kernel assumes that the guest trusts userspace to do
the right thing.
The KVM MMIO guard offers a way to mitigate this last point: a guest
can request that only certain regions of the IPA space are valid as
MMIO. Only these regions will be handled as an MMIO, and any other
will result in an exception being delivered to the guest.
This relies on a set of hypercalls defined in the KVM-specific range,
using the HVC64 calling convention.
* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO
============== ======== ================================
Function ID: (uint32) 0xC6000005
Arguments: r1-r3 Reserved / Must be zero
Return Values: (int64) NOT_SUPPORTED(-1) on error, or
(uint64) Protection Granule (PG) size in
bytes (r0)
============== ======== ================================
* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL
============== ======== ==============================
Function ID: (uint32) 0xC6000006
Arguments: none
Return Values: (int64) NOT_SUPPORTED(-1) on error, or
RET_SUCCESS(0) (r0)
============== ======== ==============================
* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP
============== ======== ====================================
Function ID: (uint32) 0xC6000007
Arguments: (uint64) The base of the PG-sized IPA range
that is allowed to be accessed as
MMIO. Must be aligned to the PG size
(r1)
(uint64) Index in the MAIR_EL1 register
providing the memory attribute that
is used by the guest (r2)
Return Values: (int64) NOT_SUPPORTED(-1) on error, or
RET_SUCCESS(0) (r0)
============== ======== ====================================
* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP
============== ======== ======================================
Function ID: (uint32) 0xC6000008
Arguments: (uint64) The base of the PG-sized IPA range
that has been previously mapped
as MMIO. Must be aligned to the PG
size (r1)
Return Values: (int64) NOT_SUPPORTED(-1) on error, or
RET_SUCCESS(0) (r0)
============== ======== ======================================

View File

@@ -0,0 +1,96 @@
.. SPDX-License-Identifier: GPL-2.0
Protected virtual machines (pKVM)
=================================
Introduction
------------
Protected KVM (pKVM) is a KVM/arm64 extension which uses the two-stage
translation capability of the Armv8 MMU to isolate guest memory from the host
system. This allows for the creation of a confidential computing environment
without relying on whizz-bang features in hardware, but still allowing room for
complementary technologies such as memory encryption and hardware-backed
attestation.
The major implementation change brought about by pKVM is that the hypervisor
code running at EL2 is now largely independent of (and isolated from) the rest
of the host kernel running at EL1 and therefore additional hypercalls are
introduced to manage manipulation of guest stage-2 page tables, creation of VM
data structures and reclamation of memory on teardown. An immediate consequence
of this change is that the host itself runs with an identity mapping enabled
at stage-2, providing the hypervisor code with a mechanism to restrict host
access to an arbitrary physical page.
Enabling pKVM
-------------
The pKVM hypervisor is enabled by booting the host kernel at EL2 with
"``kvm-arm.mode=protected``" on the command-line. Once enabled, VMs can be spawned
in either protected or non-protected state, although the hypervisor is still
responsible for managing most of the VM metadata in either case.
Limitations
-----------
Enabling pKVM places some significant limitations on KVM guests, regardless of
whether they are spawned in protected state. It is therefore recommended only
to enable pKVM if protected VMs are required, with non-protected state acting
primarily as a debug and development aid.
If you're still keen, then here is an incomplete list of caveats that apply
to all VMs running under pKVM:
- Guest memory cannot be file-backed (with the exception of shmem/memfd) and is
pinned as it is mapped into the guest. This prevents the host from
swapping-out, migrating, merging or generally doing anything useful with the
guest pages. It also requires that the VMM has either ``CAP_IPC_LOCK`` or
sufficient ``RLIMIT_MEMLOCK`` to account for this pinned memory.
- GICv2 is not supported and therefore GICv3 hardware is required in order
to expose a virtual GICv3 to the guest.
- Read-only memslots are unsupported and therefore dirty logging cannot be
enabled.
- Memslot configuration is fixed once a VM has started running, with subsequent
move or deletion requests being rejected with ``-EPERM``.
- There are probably many others.
Since the host is unable to tear down the hypervisor when pKVM is enabled,
hibernation (``CONFIG_HIBERNATION``) and kexec (``CONFIG_KEXEC``) will fail
with ``-EBUSY``.
If you are not happy with these limitations, then please don't enable pKVM :)
VM creation
-----------
When pKVM is enabled, protected VMs can be created by specifying the
``KVM_VM_TYPE_ARM_PROTECTED`` flag in the machine type identifier parameter
passed to ``KVM_CREATE_VM``.
Protected VMs are instantiated according to a fixed vCPU configuration
described by the ID register definitions in
``arch/arm64/include/asm/kvm_pkvm.h``. Only a subset of the architectural
features that may be available to the host are exposed to the guest and the
capabilities advertised by ``KVM_CHECK_EXTENSION`` are limited accordingly,
with the vCPU registers being initialised to their architecturally-defined
values.
Where not defined by the architecture, the registers of a protected vCPU
are reset to zero with the exception of the PC and X0 which can be set
either by the ``KVM_SET_ONE_REG`` interface or by a call to PSCI ``CPU_ON``.
VM runtime
----------
By default, memory pages mapped into a protected guest are inaccessible to the
host and any attempt by the host to access such a page will result in the
injection of an abort at EL1 by the hypervisor. For accesses originating from
EL0, the host will then terminate the current task with a ``SIGSEGV``.
pKVM exposes additional hypercalls to protected guests, primarily for the
purpose of establishing shared-memory regions with the host for communication
and I/O. These hypercalls are documented in hypercalls.rst.

View File

@@ -1,77 +0,0 @@
.. SPDX-License-Identifier: GPL-2.0
=========================================
Power State Coordination Interface (PSCI)
=========================================
KVM implements the PSCI (Power State Coordination Interface)
specification in order to provide services such as CPU on/off, reset
and power-off to the guest.
The PSCI specification is regularly updated to provide new features,
and KVM implements these updates if they make sense from a virtualization
point of view.
This means that a guest booted on two different versions of KVM can
observe two different "firmware" revisions. This could cause issues if
a given guest is tied to a particular PSCI revision (unlikely), or if
a migration causes a different PSCI version to be exposed out of the
blue to an unsuspecting guest.
In order to remedy this situation, KVM exposes a set of "firmware
pseudo-registers" that can be manipulated using the GET/SET_ONE_REG
interface. These registers can be saved/restored by userspace, and set
to a convenient value if required.
The following register is defined:
* KVM_REG_ARM_PSCI_VERSION:
- Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set
(and thus has already been initialized)
- Returns the current PSCI version on GET_ONE_REG (defaulting to the
highest PSCI version implemented by KVM and compatible with v0.2)
- Allows any PSCI version implemented by KVM and compatible with
v0.2 to be set with SET_ONE_REG
- Affects the whole VM (even if the register view is per-vcpu)
* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
Holds the state of the firmware support to mitigate CVE-2017-5715, as
offered by KVM to the guest via a HVC call. The workaround is described
under SMCCC_ARCH_WORKAROUND_1 in [1].
Accepted values are:
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL:
KVM does not offer
firmware support for the workaround. The mitigation status for the
guest is unknown.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL:
The workaround HVC call is
available to the guest and required for the mitigation.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED:
The workaround HVC call
is available to the guest, but it is not needed on this VCPU.
* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
Holds the state of the firmware support to mitigate CVE-2018-3639, as
offered by KVM to the guest via a HVC call. The workaround is described
under SMCCC_ARCH_WORKAROUND_2 in [1]_.
Accepted values are:
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
A workaround is not
available. KVM does not offer firmware support for the workaround.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
The workaround state is
unknown. KVM does not offer firmware support for the workaround.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
The workaround is available,
and can be disabled by a vCPU. If
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for
this vCPU.
KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
The workaround is always active on this vCPU or it is not needed.
.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf

View File

@@ -7,19 +7,29 @@ PTP_KVM is used for high precision time sync between host and guests.
It relies on transferring the wall clock and counter value from the It relies on transferring the wall clock and counter value from the
host to the guest using a KVM-specific hypercall. host to the guest using a KVM-specific hypercall.
* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001 ``ARM_SMCCC_KVM_FUNC_PTP``
----------------------------------------
This hypercall uses the SMC32/HVC32 calling convention: Retrieve current time information for the specific counter. There are no
endianness restrictions.
ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID +---------------------+-------------------------------------------------------+
============== ======== ===================================== | Presence: | Optional |
Function ID: (uint32) 0x86000001 +---------------------+-------------------------------------------------------+
Arguments: (uint32) KVM_PTP_VIRT_COUNTER(0) | Calling convention: | HVC32 |
KVM_PTP_PHYS_COUNTER(1) +---------------------+----------+--------------------------------------------+
Return Values: (int32) NOT_SUPPORTED(-1) on error, or | Function ID: | (uint32) | 0x86000001 |
(uint32) Upper 32 bits of wall clock time (r0) +---------------------+----------+----+---------------------------------------+
(uint32) Lower 32 bits of wall clock time (r1) | Arguments: | (uint32) | R1 | ``KVM_PTP_VIRT_COUNTER (0)`` |
(uint32) Upper 32 bits of counter (r2) | | | +---------------------------------------+
(uint32) Lower 32 bits of counter (r3) | | | | ``KVM_PTP_PHYS_COUNTER (1)`` |
Endianness: No Restrictions. +---------------------+----------+----+---------------------------------------+
============== ======== ===================================== | Return Values: | (int32) | R0 | ``NOT_SUPPORTED (-1)`` on error, else |
| | | | upper 32 bits of wall clock time |
| +----------+----+---------------------------------------+
| | (uint32) | R1 | Lower 32 bits of wall clock time |
| +----------+----+---------------------------------------+
| | (uint32) | R2 | Upper 32 bits of counter |
| +----------+----+---------------------------------------+
| | (uint32) | R3 | Lower 32 bits of counter |
+---------------------+----------+----+---------------------------------------+

View File

@@ -70,7 +70,7 @@ irqchip.
-ENODEV PMUv3 not supported or GIC not initialized -ENODEV PMUv3 not supported or GIC not initialized
-ENXIO PMUv3 not properly configured or in-kernel irqchip not -ENXIO PMUv3 not properly configured or in-kernel irqchip not
configured as required prior to calling this attribute configured as required prior to calling this attribute
-EBUSY PMUv3 already initialized -EBUSY PMUv3 already initialized or a VCPU has already run
-EINVAL Invalid filter range -EINVAL Invalid filter range
======= ====================================================== ======= ======================================================
@@ -104,11 +104,43 @@ hardware event. Filtering event 0x1E (CHAIN) has no effect either, as it
isn't strictly speaking an event. Filtering the cycle counter is possible isn't strictly speaking an event. Filtering the cycle counter is possible
using event 0x11 (CPU_CYCLES). using event 0x11 (CPU_CYCLES).
1.4 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_SET_PMU
------------------------------------------
:Parameters: in kvm_device_attr.addr the address to an int representing the PMU
identifier.
:Returns:
======= ====================================================
-EBUSY PMUv3 already initialized, a VCPU has already run or
an event filter has already been set
-EFAULT Error accessing the PMU identifier
-ENXIO PMU not found
-ENODEV PMUv3 not supported or GIC not initialized
-ENOMEM Could not allocate memory
======= ====================================================
Request that the VCPU uses the specified hardware PMU when creating guest events
for the purpose of PMU emulation. The PMU identifier can be read from the "type"
file for the desired PMU instance under /sys/devices (or, equivalently,
/sys/bus/event_source). This attribute is particularly useful on heterogeneous
systems where there are at least two CPU PMUs on the system. The PMU that is set
for one VCPU will be used by all the other VCPUs. It isn't possible to set a PMU
if a PMU event filter is already present.
Note that KVM will not make any attempts to run the VCPU on the physical CPUs
associated with the PMU specified by this attribute. This is entirely left to
userspace. However, attempting to run the VCPU on a physical CPU not supported
by the PMU will fail and KVM_RUN will return with
exit_reason = KVM_EXIT_FAIL_ENTRY and populate the fail_entry struct by setting
hardware_entry_failure_reason field to KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED and
the cpu field to the processor id.
2. GROUP: KVM_ARM_VCPU_TIMER_CTRL 2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
================================= =================================
:Architectures: ARM, ARM64 :Architectures: ARM64
2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER 2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER
----------------------------------------------------------------------------- -----------------------------------------------------------------------------

View File

@@ -42,6 +42,7 @@ descriptions of data structures and algorithms.
ksm ksm
memory-model memory-model
mmu_notifier mmu_notifier
multigen_lru
numa numa
overcommit-accounting overcommit-accounting
page_migration page_migration

View File

@@ -0,0 +1,159 @@
.. SPDX-License-Identifier: GPL-2.0
=============
Multi-Gen LRU
=============
The multi-gen LRU is an alternative LRU implementation that optimizes
page reclaim and improves performance under memory pressure. Page
reclaim decides the kernel's caching policy and ability to overcommit
memory. It directly impacts the kswapd CPU usage and RAM efficiency.
Design overview
===============
Objectives
----------
The design objectives are:
* Good representation of access recency
* Try to profit from spatial locality
* Fast paths to make obvious choices
* Simple self-correcting heuristics
The representation of access recency is at the core of all LRU
implementations. In the multi-gen LRU, each generation represents a
group of pages with similar access recency. Generations establish a
(time-based) common frame of reference and therefore help make better
choices, e.g., between different memcgs on a computer or different
computers in a data center (for job scheduling).
Exploiting spatial locality improves efficiency when gathering the
accessed bit. A rmap walk targets a single page and does not try to
profit from discovering a young PTE. A page table walk can sweep all
the young PTEs in an address space, but the address space can be too
sparse to make a profit. The key is to optimize both methods and use
them in combination.
Fast paths reduce code complexity and runtime overhead. Unmapped pages
do not require TLB flushes; clean pages do not require writeback.
These facts are only helpful when other conditions, e.g., access
recency, are similar. With generations as a common frame of reference,
additional factors stand out. But obvious choices might not be good
choices; thus self-correction is necessary.
The benefits of simple self-correcting heuristics are self-evident.
Again, with generations as a common frame of reference, this becomes
attainable. Specifically, pages in the same generation can be
categorized based on additional factors, and a feedback loop can
statistically compare the refault percentages across those categories
and infer which of them are better choices.
Assumptions
-----------
The protection of hot pages and the selection of cold pages are based
on page access channels and patterns. There are two access channels:
* Accesses through page tables
* Accesses through file descriptors
The protection of the former channel is by design stronger because:
1. The uncertainty in determining the access patterns of the former
channel is higher due to the approximation of the accessed bit.
2. The cost of evicting the former channel is higher due to the TLB
flushes required and the likelihood of encountering the dirty bit.
3. The penalty of underprotecting the former channel is higher because
applications usually do not prepare themselves for major page
faults like they do for blocked I/O. E.g., GUI applications
commonly use dedicated I/O threads to avoid blocking rendering
threads.
There are also two access patterns:
* Accesses exhibiting temporal locality
* Accesses not exhibiting temporal locality
For the reasons listed above, the former channel is assumed to follow
the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is
present, and the latter channel is assumed to follow the latter
pattern unless outlying refaults have been observed.
Workflow overview
=================
Evictable pages are divided into multiple generations for each
``lruvec``. The youngest generation number is stored in
``lrugen->max_seq`` for both anon and file types as they are aged on
an equal footing. The oldest generation numbers are stored in
``lrugen->min_seq[]`` separately for anon and file types as clean file
pages can be evicted regardless of swap constraints. These three
variables are monotonically increasing.
Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)``
bits in order to fit into the gen counter in ``folio->flags``. Each
truncated generation number is an index to ``lrugen->lists[]``. The
sliding window technique is used to track at least ``MIN_NR_GENS`` and
at most ``MAX_NR_GENS`` generations. The gen counter stores a value
within ``[1, MAX_NR_GENS]`` while a page is on one of
``lrugen->lists[]``; otherwise it stores zero.
Each generation is divided into multiple tiers. A page accessed ``N``
times through file descriptors is in tier ``order_base_2(N)``. Unlike
generations, tiers do not have dedicated ``lrugen->lists[]``. In
contrast to moving across generations, which requires the LRU lock,
moving across tiers only involves atomic operations on
``folio->flags`` and therefore has a negligible cost. A feedback loop
modeled after the PID controller monitors refaults over all the tiers
from anon and file types and decides which tiers from which types to
evict or protect.
There are two conceptually independent procedures: the aging and the
eviction. They form a closed-loop system, i.e., the page reclaim.
Aging
-----
The aging produces young generations. Given an ``lruvec``, it
increments ``max_seq`` when ``max_seq-min_seq+1`` approaches
``MIN_NR_GENS``. The aging promotes hot pages to the youngest
generation when it finds them accessed through page tables; the
demotion of cold pages happens consequently when it increments
``max_seq``. The aging uses page table walks and rmap walks to find
young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list``
and calls ``walk_page_range()`` with each ``mm_struct`` on this list
to scan PTEs, and after each iteration, it increments ``max_seq``. For
the latter, when the eviction walks the rmap and finds a young PTE,
the aging scans the adjacent PTEs. For both, on finding a young PTE,
the aging clears the accessed bit and updates the gen counter of the
page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``.
Eviction
--------
The eviction consumes old generations. Given an ``lruvec``, it
increments ``min_seq`` when ``lrugen->lists[]`` indexed by
``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to
evict from, it first compares ``min_seq[]`` to select the older type.
If both types are equally old, it selects the one whose first tier has
a lower refault percentage. The first tier contains single-use
unmapped clean pages, which are the best bet. The eviction sorts a
page according to its gen counter if the aging has found this page
accessed through page tables and updated its gen counter. It also
moves a page to the next generation, i.e., ``min_seq+1``, if this page
was accessed multiple times through file descriptors and the feedback
loop has detected outlying refaults from the tier this page is in. To
this end, the feedback loop uses the first tier as the baseline, for
the reason stated earlier.
Summary
-------
The multi-gen LRU can be disassembled into the following parts:
* Generations
* Rmap walks
* Page table walks
* Bloom filters
* PID controller
The aging and the eviction form a producer-consumer model;
specifically, the latter drives the former by the sliding window over
generations. Within the aging, rmap walks drive page table walks by
inserting hot densely populated page tables to the Bloom filters.
Within the eviction, the PID controller uses refaults as the feedback
to select types to evict and tiers to protect.

View File

@@ -30,3 +30,6 @@ source "lib/Kconfig"
source "lib/Kconfig.debug" source "lib/Kconfig.debug"
source "Documentation/Kconfig" source "Documentation/Kconfig"
# ANDROID: Set KCONFIG_EXT_PREFIX to descend into an external project.
source "$(KCONFIG_EXT_PREFIX)Kconfig.ext"

3
Kconfig.ext Normal file
View File

@@ -0,0 +1,3 @@
# SPDX-License-Identifier: GPL-2.0
# This file is intentionally empty. It's used as a placeholder for when
# KCONFIG_EXT_PREFIX isn't defined.

View File

@@ -2424,7 +2424,7 @@ F: drivers/pci/controller/dwc/pcie-qcom.c
F: drivers/phy/qualcomm/ F: drivers/phy/qualcomm/
F: drivers/power/*/msm* F: drivers/power/*/msm*
F: drivers/reset/reset-qcom-* F: drivers/reset/reset-qcom-*
F: drivers/scsi/ufs/ufs-qcom* F: drivers/ufs/host/ufs-qcom*
F: drivers/spi/spi-geni-qcom.c F: drivers/spi/spi-geni-qcom.c
F: drivers/spi/spi-qcom-qspi.c F: drivers/spi/spi-qcom-qspi.c
F: drivers/spi/spi-qup.c F: drivers/spi/spi-qup.c
@@ -7176,6 +7176,7 @@ M: Chao Yu <chao@kernel.org>
L: linux-f2fs-devel@lists.sourceforge.net L: linux-f2fs-devel@lists.sourceforge.net
S: Maintained S: Maintained
W: https://f2fs.wiki.kernel.org/ W: https://f2fs.wiki.kernel.org/
B: https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs
T: git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git
F: Documentation/ABI/testing/sysfs-fs-f2fs F: Documentation/ABI/testing/sysfs-fs-f2fs
F: Documentation/filesystems/f2fs.rst F: Documentation/filesystems/f2fs.rst
@@ -9224,6 +9225,13 @@ F: Documentation/hwmon/ina2xx.rst
F: drivers/hwmon/ina2xx.c F: drivers/hwmon/ina2xx.c
F: include/linux/platform_data/ina2xx.h F: include/linux/platform_data/ina2xx.h
INCREMENTAL FILE SYSTEM
M: Paul Lawrence <paullawrence@google.com>
L: linux-unionfs@vger.kernel.org
S: Supported
F: fs/incfs/
F: tools/testing/selftests/filesystems/incfs/
INDUSTRY PACK SUBSYSTEM (IPACK) INDUSTRY PACK SUBSYSTEM (IPACK)
M: Samuel Iglesias Gonsalvez <siglesias@igalia.com> M: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
M: Jens Taprogge <jens.taprogge@taprogge.org> M: Jens Taprogge <jens.taprogge@taprogge.org>
@@ -10136,7 +10144,6 @@ F: arch/*/include/asm/*kasan.h
F: arch/*/mm/kasan_init* F: arch/*/mm/kasan_init*
F: include/linux/kasan*.h F: include/linux/kasan*.h
F: lib/Kconfig.kasan F: lib/Kconfig.kasan
F: lib/test_kasan*.c
F: mm/kasan/ F: mm/kasan/
F: scripts/Makefile.kasan F: scripts/Makefile.kasan
@@ -10305,8 +10312,10 @@ M: Marc Zyngier <maz@kernel.org>
R: James Morse <james.morse@arm.com> R: James Morse <james.morse@arm.com>
R: Alexandru Elisei <alexandru.elisei@arm.com> R: Alexandru Elisei <alexandru.elisei@arm.com>
R: Suzuki K Poulose <suzuki.poulose@arm.com> R: Suzuki K Poulose <suzuki.poulose@arm.com>
R: Oliver Upton <oliver.upton@linux.dev>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
L: kvmarm@lists.cs.columbia.edu (moderated for non-subscribers) L: kvmarm@lists.linux.dev
L: kvmarm@lists.cs.columbia.edu (deprecated, moderated for non-subscribers)
S: Maintained S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
F: arch/arm64/include/asm/kvm* F: arch/arm64/include/asm/kvm*
@@ -13322,6 +13331,12 @@ W: http://www.netlab.is.tsukuba.ac.jp/~yokota/izumi/ninja/
F: Documentation/scsi/NinjaSCSI.rst F: Documentation/scsi/NinjaSCSI.rst
F: drivers/scsi/nsp32* F: drivers/scsi/nsp32*
NINTENDO HID DRIVER
M: Daniel J. Ogorchock <djogorchock@gmail.com>
L: linux-input@vger.kernel.org
S: Maintained
F: drivers/hid/hid-nintendo*
NIOS2 ARCHITECTURE NIOS2 ARCHITECTURE
M: Dinh Nguyen <dinguyen@kernel.org> M: Dinh Nguyen <dinguyen@kernel.org>
S: Maintained S: Maintained
@@ -16757,6 +16772,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git
T: git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/mkp/scsi.git
F: Documentation/devicetree/bindings/scsi/ F: Documentation/devicetree/bindings/scsi/
F: drivers/scsi/ F: drivers/scsi/
F: drivers/ufs/
F: include/scsi/ F: include/scsi/
SCSI TAPE DRIVER SCSI TAPE DRIVER
@@ -19312,23 +19328,24 @@ F: include/linux/visorbus.h
UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER
R: Alim Akhtar <alim.akhtar@samsung.com> R: Alim Akhtar <alim.akhtar@samsung.com>
R: Avri Altman <avri.altman@wdc.com> R: Avri Altman <avri.altman@wdc.com>
R: Bart Van Assche <bvanassche@acm.org>
L: linux-scsi@vger.kernel.org L: linux-scsi@vger.kernel.org
S: Supported S: Supported
F: Documentation/scsi/ufs.rst F: Documentation/scsi/ufs.rst
F: drivers/scsi/ufs/ F: drivers/ufs/core/
UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER DWC HOOKS UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER DWC HOOKS
M: Pedro Sousa <pedrom.sousa@synopsys.com> M: Pedro Sousa <pedrom.sousa@synopsys.com>
L: linux-scsi@vger.kernel.org L: linux-scsi@vger.kernel.org
S: Supported S: Supported
F: drivers/scsi/ufs/*dwc* F: drivers/ufs/host/*dwc*
UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER MEDIATEK HOOKS UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER MEDIATEK HOOKS
M: Stanley Chu <stanley.chu@mediatek.com> M: Stanley Chu <stanley.chu@mediatek.com>
L: linux-scsi@vger.kernel.org L: linux-scsi@vger.kernel.org
L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) L: linux-mediatek@lists.infradead.org (moderated for non-subscribers)
S: Maintained S: Maintained
F: drivers/scsi/ufs/ufs-mediatek* F: drivers/ufs/host/ufs-mediatek*
UNSORTED BLOCK IMAGES (UBI) UNSORTED BLOCK IMAGES (UBI)
M: Richard Weinberger <richard@nod.at> M: Richard Weinberger <richard@nod.at>

131
Makefile
View File

@@ -136,6 +136,24 @@ endif
export KBUILD_EXTMOD export KBUILD_EXTMOD
# ANDROID: set up mixed-build support. mixed-build allows device kernel modules
# to be compiled against a GKI kernel. This approach still uses the headers and
# Kbuild from device kernel, so care must be taken to ensure that those headers match.
ifdef KBUILD_MIXED_TREE
# Need vmlinux.symvers for modpost and System.map for depmod, check whether they exist in KBUILD_MIXED_TREE
required_mixed_files=vmlinux.symvers System.map
# Count the required files actually present under KBUILD_MIXED_TREE and compare
# that with the number of required files. filter-out yields a non-empty result
# only when the two counts differ, i.e. when at least one file is missing, and
# only then is the error raised.
$(if $(filter-out $(words $(required_mixed_files)), \
$(words $(wildcard $(addprefix $(KBUILD_MIXED_TREE)/,$(required_mixed_files))))),\
$(error KBUILD_MIXED_TREE=$(KBUILD_MIXED_TREE) doesn't contain $(required_mixed_files)))
endif
mixed-build-prefix = $(if $(KBUILD_MIXED_TREE),$(KBUILD_MIXED_TREE)/)
export KBUILD_MIXED_TREE
# This is a hack for kleaf to set mixed-build-prefix within the execution of a make rule, e.g.
# within __modinst_pre.
# TODO(b/205893923): Revert this hack once it is properly handled.
export mixed-build-prefix
# Kbuild will save output files in the current working directory. # Kbuild will save output files in the current working directory.
# This does not need to match to the root of the kernel source tree. # This does not need to match to the root of the kernel source tree.
# #
@@ -432,11 +450,12 @@ HOSTCXX = g++
endif endif
HOSTPKG_CONFIG = pkg-config HOSTPKG_CONFIG = pkg-config
export KBUILD_USERCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \ KBUILD_USERHOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
-O2 -fomit-frame-pointer -std=gnu89 -O2 -fomit-frame-pointer -std=gnu89
export KBUILD_USERLDFLAGS := KBUILD_USERCFLAGS := $(KBUILD_USERHOSTCFLAGS) $(USERCFLAGS)
KBUILD_USERLDFLAGS := $(USERLDFLAGS)
KBUILD_HOSTCFLAGS := $(KBUILD_USERCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS) KBUILD_HOSTCFLAGS := $(KBUILD_USERHOSTCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS)
KBUILD_HOSTCXXFLAGS := -Wall -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS) KBUILD_HOSTCXXFLAGS := -Wall -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS)
KBUILD_HOSTLDFLAGS := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS) KBUILD_HOSTLDFLAGS := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS)
KBUILD_HOSTLDLIBS := $(HOST_LFS_LIBS) $(HOSTLDLIBS) KBUILD_HOSTLDLIBS := $(HOST_LFS_LIBS) $(HOSTLDLIBS)
@@ -477,7 +496,7 @@ KGZIP = gzip
KBZIP2 = bzip2 KBZIP2 = bzip2
KLZOP = lzop KLZOP = lzop
LZMA = lzma LZMA = lzma
LZ4 = lz4c LZ4 = lz4
XZ = xz XZ = xz
ZSTD = zstd ZSTD = zstd
@@ -531,6 +550,7 @@ export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AW
export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ ZSTD export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ ZSTD
export KBUILD_HOSTCXXFLAGS KBUILD_HOSTLDFLAGS KBUILD_HOSTLDLIBS LDFLAGS_MODULE export KBUILD_HOSTCXXFLAGS KBUILD_HOSTLDFLAGS KBUILD_HOSTLDLIBS LDFLAGS_MODULE
export KBUILD_USERCFLAGS KBUILD_USERLDFLAGS
export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS KBUILD_LDFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS KBUILD_LDFLAGS
export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE
@@ -672,11 +692,13 @@ drivers-y += virt/
libs-y := lib/ libs-y := lib/
endif # KBUILD_EXTMOD endif # KBUILD_EXTMOD
ifndef KBUILD_MIXED_TREE
# The all: target is the default when no target is given on the # The all: target is the default when no target is given on the
# command line. # command line.
# This allow a user to issue only 'make' to build a kernel including modules # This allow a user to issue only 'make' to build a kernel including modules
# Defaults to vmlinux, but the arch makefile usually adds further targets # Defaults to vmlinux, but the arch makefile usually adds further targets
all: vmlinux all: vmlinux
endif
CFLAGS_GCOV := -fprofile-arcs -ftest-coverage CFLAGS_GCOV := -fprofile-arcs -ftest-coverage
ifdef CONFIG_CC_IS_GCC ifdef CONFIG_CC_IS_GCC
@@ -955,7 +977,13 @@ KBUILD_LDFLAGS += --thinlto-cache-dir=$(extmod_prefix).thinlto-cache
else else
CC_FLAGS_LTO := -flto CC_FLAGS_LTO := -flto
endif endif
ifeq ($(SRCARCH),x86)
# Workaround for compiler / linker bug
CC_FLAGS_LTO += -fvisibility=hidden CC_FLAGS_LTO += -fvisibility=hidden
else
CC_FLAGS_LTO += -fvisibility=default
endif
# Limit inlining across translation units to reduce binary size # Limit inlining across translation units to reduce binary size
KBUILD_LDFLAGS += -mllvm -import-instr-limit=5 KBUILD_LDFLAGS += -mllvm -import-instr-limit=5
@@ -1150,6 +1178,40 @@ export extmod_prefix = $(if $(KBUILD_EXTMOD),$(KBUILD_EXTMOD)/)
export MODORDER := $(extmod_prefix)modules.order export MODORDER := $(extmod_prefix)modules.order
export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps
# ---------------------------------------------------------------------------
# Kernel headers
PHONY += headers
#Default location for installed headers
ifeq ($(KBUILD_EXTMOD),)
PHONY += archheaders archscripts
hdr-inst := -f $(srctree)/scripts/Makefile.headersinst obj
headers: $(version_h) scripts_unifdef uapi-asm-generic archheaders archscripts
else
hdr-prefix = $(KBUILD_EXTMOD)/
hdr-inst := -f $(srctree)/scripts/Makefile.headersinst dst=$(KBUILD_EXTMOD)/usr/include objtree=$(objtree)/$(KBUILD_EXTMOD) obj
endif
export INSTALL_HDR_PATH = $(objtree)/$(hdr-prefix)usr
quiet_cmd_headers_install = INSTALL $(INSTALL_HDR_PATH)/include
cmd_headers_install = \
mkdir -p $(INSTALL_HDR_PATH); \
rsync -mrl --include='*/' --include='*\.h' --exclude='*' \
$(hdr-prefix)usr/include $(INSTALL_HDR_PATH);
PHONY += headers_install
headers_install: headers
$(call cmd,headers_install)
headers:
ifeq ($(KBUILD_EXTMOD),)
$(if $(filter um, $(SRCARCH)), $(error Headers not exportable for UML))
endif
$(Q)$(MAKE) $(hdr-inst)=$(hdr-prefix)include/uapi
$(Q)$(MAKE) $(hdr-inst)=$(hdr-prefix)arch/$(SRCARCH)/include/uapi
ifeq ($(KBUILD_EXTMOD),) ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/
core-$(CONFIG_BLOCK) += block/ core-$(CONFIG_BLOCK) += block/
@@ -1215,8 +1277,10 @@ cmd_link-vmlinux = \
$(CONFIG_SHELL) $< "$(LD)" "$(KBUILD_LDFLAGS)" "$(LDFLAGS_vmlinux)"; \ $(CONFIG_SHELL) $< "$(LD)" "$(KBUILD_LDFLAGS)" "$(LDFLAGS_vmlinux)"; \
$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true) $(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true)
ifndef KBUILD_MIXED_TREE
vmlinux: scripts/link-vmlinux.sh autoksyms_recursive $(vmlinux-deps) FORCE vmlinux: scripts/link-vmlinux.sh autoksyms_recursive $(vmlinux-deps) FORCE
+$(call if_changed_dep,link-vmlinux) +$(call if_changed_dep,link-vmlinux)
endif
targets := vmlinux targets := vmlinux
@@ -1225,7 +1289,8 @@ targets := vmlinux
$(sort $(vmlinux-deps) $(subdir-modorder)): descend ; $(sort $(vmlinux-deps) $(subdir-modorder)): descend ;
filechk_kernel.release = \ filechk_kernel.release = \
echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))" echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion \
$(srctree) $(BRANCH) $(KMI_GENERATION))"
# Store (new) KERNELRELEASE string in include/config/kernel.release # Store (new) KERNELRELEASE string in include/config/kernel.release
include/config/kernel.release: FORCE include/config/kernel.release: FORCE
@@ -1315,32 +1380,6 @@ headerdep:
$(Q)find $(srctree)/include/ -name '*.h' | xargs --max-args 1 \ $(Q)find $(srctree)/include/ -name '*.h' | xargs --max-args 1 \
$(srctree)/scripts/headerdep.pl -I$(srctree)/include $(srctree)/scripts/headerdep.pl -I$(srctree)/include
# ---------------------------------------------------------------------------
# Kernel headers
#Default location for installed headers
export INSTALL_HDR_PATH = $(objtree)/usr
quiet_cmd_headers_install = INSTALL $(INSTALL_HDR_PATH)/include
cmd_headers_install = \
mkdir -p $(INSTALL_HDR_PATH); \
rsync -mrl --include='*/' --include='*\.h' --exclude='*' \
usr/include $(INSTALL_HDR_PATH)
PHONY += headers_install
headers_install: headers
$(call cmd,headers_install)
PHONY += archheaders archscripts
hdr-inst := -f $(srctree)/scripts/Makefile.headersinst obj
PHONY += headers
headers: $(version_h) scripts_unifdef uapi-asm-generic archheaders archscripts
$(if $(filter um, $(SRCARCH)), $(error Headers not exportable for UML))
$(Q)$(MAKE) $(hdr-inst)=include/uapi
$(Q)$(MAKE) $(hdr-inst)=arch/$(SRCARCH)/include/uapi
# Deprecated. It is no-op now. # Deprecated. It is no-op now.
PHONY += headers_check PHONY += headers_check
headers_check: headers_check:
@@ -1426,7 +1465,9 @@ kselftest-merge:
# Devicetree files # Devicetree files
ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/boot/dts/),) ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/boot/dts/),)
dtstree := arch/$(SRCARCH)/boot/dts # ANDROID: allow this to be overridden by the build environment. This allows
# one to compile a device tree that is located out-of-tree.
dtstree ?= arch/$(SRCARCH)/boot/dts
endif endif
ifneq ($(dtstree),) ifneq ($(dtstree),)
@@ -1492,7 +1533,9 @@ endif
# using awk while concatenating to the final file. # using awk while concatenating to the final file.
PHONY += modules PHONY += modules
modules: $(if $(KBUILD_BUILTIN),vmlinux) modules_check modules_prepare # if KBUILD_BUILTIN && !KBUILD_MIXED_TREE, depend on vmlinux
modules: $(if $(KBUILD_BUILTIN), $(if $(KBUILD_MIXED_TREE),,vmlinux))
modules: modules_check modules_prepare
cmd_modules_order = $(AWK) '!x[$$0]++' $(real-prereqs) > $@ cmd_modules_order = $(AWK) '!x[$$0]++' $(real-prereqs) > $@
@@ -1537,8 +1580,8 @@ __modinst_pre:
ln -s $(CURDIR) $(MODLIB)/build ; \ ln -s $(CURDIR) $(MODLIB)/build ; \
fi fi
@sed 's:^:kernel/:' modules.order > $(MODLIB)/modules.order @sed 's:^:kernel/:' modules.order > $(MODLIB)/modules.order
@cp -f modules.builtin $(MODLIB)/ @cp -f $(mixed-build-prefix)modules.builtin $(MODLIB)/
@cp -f $(objtree)/modules.builtin.modinfo $(MODLIB)/ @cp -f $(or $(mixed-build-prefix),$(objtree)/)modules.builtin.modinfo $(MODLIB)/
endif # CONFIG_MODULES endif # CONFIG_MODULES
@@ -1799,6 +1842,8 @@ help:
@echo '' @echo ''
@echo ' modules - default target, build the module(s)' @echo ' modules - default target, build the module(s)'
@echo ' modules_install - install the module' @echo ' modules_install - install the module'
@echo ' headers_install - Install sanitised kernel headers to INSTALL_HDR_PATH'
@echo ' (default: $(abspath $(INSTALL_HDR_PATH)))'
@echo ' clean - remove generated files in module directory only' @echo ' clean - remove generated files in module directory only'
@echo '' @echo ''
@@ -1823,7 +1868,7 @@ modules_check: $(MODORDER)
quiet_cmd_depmod = DEPMOD $(MODLIB) quiet_cmd_depmod = DEPMOD $(MODLIB)
cmd_depmod = $(CONFIG_SHELL) $(srctree)/scripts/depmod.sh $(DEPMOD) \ cmd_depmod = $(CONFIG_SHELL) $(srctree)/scripts/depmod.sh $(DEPMOD) \
$(KERNELRELEASE) $(KERNELRELEASE) $(mixed-build-prefix)
modules_install: modules_install:
$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.modinst
@@ -1860,7 +1905,8 @@ ifdef single-build
# .ko is special because modpost is needed # .ko is special because modpost is needed
single-ko := $(sort $(filter %.ko, $(MAKECMDGOALS))) single-ko := $(sort $(filter %.ko, $(MAKECMDGOALS)))
single-no-ko := $(sort $(patsubst %.ko,%.mod, $(MAKECMDGOALS))) single-no-ko := $(filter-out $(single-ko), $(MAKECMDGOALS)) \
$(foreach x, o mod, $(patsubst %.ko, %.$x, $(single-ko)))
$(single-ko): single_modpost $(single-ko): single_modpost
@: @:
@@ -1902,7 +1948,7 @@ descend: $(build-dirs)
$(build-dirs): prepare $(build-dirs): prepare
$(Q)$(MAKE) $(build)=$@ \ $(Q)$(MAKE) $(build)=$@ \
single-build=$(if $(filter-out $@/, $(filter $@/%, $(KBUILD_SINGLE_TARGETS))),1) \ single-build=$(if $(filter-out $@/, $(filter $@/%, $(KBUILD_SINGLE_TARGETS))),1) \
need-builtin=1 need-modorder=1 $(if $(KBUILD_MIXED_TREE),,need-builtin=1) need-modorder=1
clean-dirs := $(addprefix _clean_, $(clean-dirs)) clean-dirs := $(addprefix _clean_, $(clean-dirs))
PHONY += $(clean-dirs) clean PHONY += $(clean-dirs) clean
@@ -1911,12 +1957,14 @@ $(clean-dirs):
clean: $(clean-dirs) clean: $(clean-dirs)
$(call cmd,rmfiles) $(call cmd,rmfiles)
@find $(if $(KBUILD_EXTMOD), $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \ @find $(if $(KBUILD_EXTMOD), $(KBUILD_EXTMOD), .) \
$(if $(filter-out arch/$(SRCARCH)/boot/dts, $(dtstree)), $(dtstree)) \
$(RCS_FIND_IGNORE) \
\( -name '*.[aios]' -o -name '*.ko' -o -name '.*.cmd' \ \( -name '*.[aios]' -o -name '*.ko' -o -name '.*.cmd' \
-o -name '*.ko.*' \ -o -name '*.ko.*' \
-o -name '*.dtb' -o -name '*.dtbo' -o -name '*.dtb.S' -o -name '*.dt.yaml' \ -o -name '*.dtb' -o -name '*.dtbo' -o -name '*.dtb.S' -o -name '*.dt.yaml' \
-o -name '*.dwo' -o -name '*.lst' \ -o -name '*.dwo' -o -name '*.lst' \
-o -name '*.su' -o -name '*.mod' \ -o -name '*.su' -o -name '*.mod' -o -name '*.usyms' \
-o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \ -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
-o -name '*.lex.c' -o -name '*.tab.[ch]' \ -o -name '*.lex.c' -o -name '*.tab.[ch]' \
-o -name '*.asn1.[ch]' \ -o -name '*.asn1.[ch]' \
@@ -2007,7 +2055,8 @@ checkstack:
$(PERL) $(srctree)/scripts/checkstack.pl $(CHECKSTACK_ARCH) $(PERL) $(srctree)/scripts/checkstack.pl $(CHECKSTACK_ARCH)
kernelrelease: kernelrelease:
@echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion $(srctree))" @echo "$(KERNELVERSION)$$($(CONFIG_SHELL) $(srctree)/scripts/setlocalversion \
$(srctree) $(BRANCH) $(KMI_GENERATION))"
kernelversion: kernelversion:
@echo $(KERNELVERSION) @echo $(KERNELVERSION)

2
OWNERS Normal file
View File

@@ -0,0 +1,2 @@
# include OWNERS from the authoritative android-mainline branch
include kernel/common:android-mainline:/OWNERS

8
android/OWNERS Normal file
View File

@@ -0,0 +1,8 @@
# If we ever add another OWNERS above this directory, it's likely to be
# more permissive, so don't inherit from it
set noparent
include kernel/common:android-mainline:/OWNERS_DrNo
# Downstream boards maintained directly in this manifest branch
per-file abi_gki_aarch64_cuttlefish = adelva@google.com, rammuthiah@google.com
per-file abi_gki_aarch64_goldfish = rkir@google.com

4
android/abi_gki_aarch64 Normal file
View File

@@ -0,0 +1,4 @@
[abi_symbol_list]
# commonly used symbols
module_layout
__put_task_struct

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,517 @@
__cfg80211_alloc_event_skb
__cfg80211_alloc_reply_skb
__cfg80211_radar_event
__cfg80211_send_event_skb
__hci_cmd_send
__hci_cmd_sync
__hci_cmd_sync_ev
__nfc_alloc_vendor_cmd_reply_skb
alloc_can_err_skb
alloc_can_skb
alloc_candev_mqs
alloc_canfd_skb
arc4_crypt
arc4_setkey
baswap
bridge_tunnel_header
bt_accept_dequeue
bt_accept_enqueue
bt_accept_unlink
bt_debugfs
bt_err
bt_err_ratelimited
bt_info
bt_procfs_cleanup
bt_procfs_init
bt_sock_ioctl
bt_sock_link
bt_sock_poll
bt_sock_reclassify_lock
bt_sock_recvmsg
bt_sock_register
bt_sock_stream_recvmsg
bt_sock_unlink
bt_sock_unregister
bt_sock_wait_ready
bt_sock_wait_state
bt_to_errno
bt_warn
bt_warn_ratelimited
btbcm_check_bdaddr
btbcm_finalize
btbcm_initialize
btbcm_patchram
btbcm_read_pcm_int_params
btbcm_set_bdaddr
btbcm_setup_apple
btbcm_setup_patchram
btbcm_write_pcm_int_params
can_bus_off
can_change_mtu
can_change_state
can_fd_dlc2len
can_fd_len2dlc
can_free_echo_skb
can_get_echo_skb
can_get_state_str
can_proto_register
can_proto_unregister
can_put_echo_skb
can_rx_offload_add_fifo
can_rx_offload_add_manual
can_rx_offload_add_timestamp
can_rx_offload_del
can_rx_offload_enable
can_rx_offload_get_echo_skb
can_rx_offload_irq_finish
can_rx_offload_irq_offload_fifo
can_rx_offload_irq_offload_timestamp
can_rx_offload_queue_sorted
can_rx_offload_queue_tail
can_rx_offload_threaded_irq_finish
can_rx_register
can_rx_unregister
can_send
can_skb_get_frame_len
can_sock_destruct
cfg80211_any_usable_channels
cfg80211_assoc_comeback
cfg80211_assoc_failure
cfg80211_auth_timeout
cfg80211_background_cac_abort
cfg80211_bss_color_notify
cfg80211_bss_flush
cfg80211_bss_iter
cfg80211_cac_event
cfg80211_calculate_bitrate
cfg80211_ch_switch_notify
cfg80211_ch_switch_started_notify
cfg80211_chandef_compatible
cfg80211_chandef_create
cfg80211_chandef_dfs_required
cfg80211_chandef_usable
cfg80211_chandef_valid
cfg80211_check_combinations
cfg80211_check_station_change
cfg80211_classify8021d
cfg80211_conn_failed
cfg80211_connect_done
cfg80211_control_port_tx_status
cfg80211_cqm_beacon_loss_notify
cfg80211_cqm_pktloss_notify
cfg80211_cqm_rssi_notify
cfg80211_cqm_txe_notify
cfg80211_crit_proto_stopped
cfg80211_del_sta_sinfo
cfg80211_disconnected
cfg80211_external_auth_request
cfg80211_find_elem_match
cfg80211_find_vendor_elem
cfg80211_free_nan_func
cfg80211_ft_event
cfg80211_get_bss
cfg80211_get_drvinfo
cfg80211_get_iftype_ext_capa
cfg80211_get_p2p_attr
cfg80211_get_station
cfg80211_gtk_rekey_notify
cfg80211_ibss_joined
cfg80211_iftype_allowed
cfg80211_inform_bss_data
cfg80211_inform_bss_frame_data
cfg80211_is_element_inherited
cfg80211_iter_combinations
cfg80211_merge_profile
cfg80211_mgmt_tx_status_ext
cfg80211_michael_mic_failure
cfg80211_nan_func_terminated
cfg80211_nan_match
cfg80211_new_sta
cfg80211_notify_new_peer_candidate
cfg80211_pmksa_candidate_notify
cfg80211_pmsr_complete
cfg80211_pmsr_report
cfg80211_port_authorized
cfg80211_probe_status
cfg80211_put_bss
cfg80211_ready_on_channel
cfg80211_ref_bss
cfg80211_reg_can_beacon
cfg80211_reg_can_beacon_relax
cfg80211_register_netdevice
cfg80211_remain_on_channel_expired
cfg80211_report_obss_beacon_khz
cfg80211_report_wowlan_wakeup
cfg80211_roamed
cfg80211_rx_assoc_resp
cfg80211_rx_control_port
cfg80211_rx_mgmt_ext
cfg80211_rx_mlme_mgmt
cfg80211_rx_spurious_frame
cfg80211_rx_unexpected_4addr_frame
cfg80211_rx_unprot_mlme_mgmt
cfg80211_scan_done
cfg80211_sched_scan_results
cfg80211_sched_scan_stopped
cfg80211_sched_scan_stopped_locked
cfg80211_send_layer2_update
cfg80211_shutdown_all_interfaces
cfg80211_sinfo_alloc_tid_stats
cfg80211_sta_opmode_change_notify
cfg80211_stop_iface
cfg80211_tdls_oper_request
cfg80211_tx_mgmt_expired
cfg80211_tx_mlme_mgmt
cfg80211_unlink_bss
cfg80211_unregister_wdev
cfg80211_update_owe_info_event
cfg80211_vendor_cmd_get_sender
cfg80211_vendor_cmd_reply
close_candev
free_candev
freq_reg_info
get_wiphy_regdom
h4_recv_buf
hci_alloc_dev_priv
hci_cmd_sync
hci_conn_check_secure
hci_conn_security
hci_conn_switch_role
hci_free_dev
hci_get_route
hci_mgmt_chan_register
hci_mgmt_chan_unregister
hci_recv_diag
hci_recv_frame
hci_register_cb
hci_register_dev
hci_release_dev
hci_reset_dev
hci_resume_dev
hci_set_fw_info
hci_set_hw_info
hci_suspend_dev
hci_uart_register_device
hci_uart_tx_wakeup
hci_uart_unregister_device
hci_unregister_cb
hci_unregister_dev
hidp_hid_driver
ieee80211_alloc_hw_nm
ieee80211_amsdu_to_8023s
ieee80211_ap_probereq_get
ieee80211_ave_rssi
ieee80211_beacon_cntdwn_is_complete
ieee80211_beacon_get_template
ieee80211_beacon_get_tim
ieee80211_beacon_loss
ieee80211_beacon_set_cntdwn
ieee80211_beacon_update_cntdwn
ieee80211_bss_get_elem
ieee80211_calc_rx_airtime
ieee80211_calc_tx_airtime
ieee80211_chandef_to_operating_class
ieee80211_channel_to_freq_khz
ieee80211_chswitch_done
ieee80211_color_change_finish
ieee80211_connection_loss
ieee80211_cqm_beacon_loss_notify
ieee80211_cqm_rssi_notify
ieee80211_csa_finish
ieee80211_ctstoself_duration
ieee80211_ctstoself_get
ieee80211_data_to_8023_exthdr
ieee80211_disable_rssi_reports
ieee80211_disconnect
ieee80211_enable_rssi_reports
ieee80211_find_sta
ieee80211_find_sta_by_ifaddr
ieee80211_free_hw
ieee80211_free_txskb
ieee80211_freq_khz_to_channel
ieee80211_generic_frame_duration
ieee80211_get_bssid
ieee80211_get_buffered_bc
ieee80211_get_channel_khz
ieee80211_get_fils_discovery_tmpl
ieee80211_get_hdrlen_from_skb
ieee80211_get_key_rx_seq
ieee80211_get_mesh_hdrlen
ieee80211_get_num_supported_channels
ieee80211_get_response_rate
ieee80211_get_tkip_p1k_iv
ieee80211_get_tkip_p2k
ieee80211_get_tkip_rx_p1k
ieee80211_get_tx_rates
ieee80211_get_unsol_bcast_probe_resp_tmpl
ieee80211_get_vht_max_nss
ieee80211_gtk_rekey_add
ieee80211_gtk_rekey_notify
ieee80211_hdrlen
ieee80211_ie_split_ric
ieee80211_iter_chan_contexts_atomic
ieee80211_iter_keys
ieee80211_iter_keys_rcu
ieee80211_iterate_active_interfaces_atomic
ieee80211_iterate_active_interfaces_mtx
ieee80211_iterate_interfaces
ieee80211_iterate_stations_atomic
ieee80211_key_mic_failure
ieee80211_key_replay
ieee80211_manage_rx_ba_offl
ieee80211_mandatory_rates
ieee80211_mark_rx_ba_filtered_frames
ieee80211_nan_func_match
ieee80211_nan_func_terminated
ieee80211_next_txq
ieee80211_nullfunc_get
ieee80211_operating_class_to_band
ieee80211_parse_p2p_noa
ieee80211_probereq_get
ieee80211_proberesp_get
ieee80211_pspoll_get
ieee80211_queue_delayed_work
ieee80211_queue_stopped
ieee80211_queue_work
ieee80211_radar_detected
ieee80211_radiotap_iterator_init
ieee80211_radiotap_iterator_next
ieee80211_rate_control_register
ieee80211_rate_control_unregister
ieee80211_ready_on_channel
ieee80211_register_hw
ieee80211_remain_on_channel_expired
ieee80211_remove_key
ieee80211_report_low_ack
ieee80211_report_wowlan_wakeup
ieee80211_request_smps
ieee80211_reserve_tid
ieee80211_restart_hw
ieee80211_resume_disconnect
ieee80211_return_txq
ieee80211_rts_duration
ieee80211_rts_get
ieee80211_rx_ba_timer_expired
ieee80211_rx_irqsafe
ieee80211_rx_list
ieee80211_rx_napi
ieee80211_s1g_channel_width
ieee80211_scan_completed
ieee80211_sched_scan_results
ieee80211_sched_scan_stopped
ieee80211_schedule_txq
ieee80211_send_bar
ieee80211_send_eosp_nullfunc
ieee80211_set_key_rx_seq
ieee80211_sta_block_awake
ieee80211_sta_eosp
ieee80211_sta_ps_transition
ieee80211_sta_pspoll
ieee80211_sta_register_airtime
ieee80211_sta_set_buffered
ieee80211_sta_uapsd_trigger
ieee80211_start_tx_ba_cb_irqsafe
ieee80211_start_tx_ba_session
ieee80211_stop_queue
ieee80211_stop_queues
ieee80211_stop_rx_ba_session
ieee80211_stop_tx_ba_cb_irqsafe
ieee80211_stop_tx_ba_session
ieee80211_tdls_oper_request
ieee80211_tkip_add_iv
ieee80211_tx_dequeue
ieee80211_tx_prepare_skb
ieee80211_tx_rate_update
ieee80211_tx_status
ieee80211_tx_status_8023
ieee80211_tx_status_ext
ieee80211_tx_status_irqsafe
ieee80211_txq_airtime_check
ieee80211_txq_get_depth
ieee80211_txq_may_transmit
ieee80211_txq_schedule_start
ieee80211_unregister_hw
ieee80211_unreserve_tid
ieee80211_update_mu_groups
ieee80211_update_p2p_noa
ieee80211_vif_to_wdev
ieee80211_wake_queue
ieee80211_wake_queues
ieee802154_alloc_hw
ieee802154_free_hw
ieee802154_hdr_peek
ieee802154_hdr_peek_addrs
ieee802154_hdr_pull
ieee802154_hdr_push
ieee802154_max_payload
ieee802154_register_hw
ieee802154_rx_irqsafe
ieee802154_stop_queue
ieee802154_unregister_hw
ieee802154_wake_queue
ieee802154_xmit_complete
ieeee80211_obss_color_collision_notify
l2cap_add_psm
l2cap_chan_close
l2cap_chan_connect
l2cap_chan_create
l2cap_chan_del
l2cap_chan_list
l2cap_chan_put
l2cap_chan_send
l2cap_chan_set_defaults
l2cap_conn_get
l2cap_conn_put
l2cap_is_socket
l2cap_register_user
l2cap_unregister_user
l2tp_recv_common
l2tp_session_create
l2tp_session_dec_refcount
l2tp_session_delete
l2tp_session_get
l2tp_session_get_by_ifname
l2tp_session_get_nth
l2tp_session_inc_refcount
l2tp_session_register
l2tp_session_set_header_len
l2tp_sk_to_tunnel
l2tp_tunnel_create
l2tp_tunnel_dec_refcount
l2tp_tunnel_delete
l2tp_tunnel_get
l2tp_tunnel_get_nth
l2tp_tunnel_get_session
l2tp_tunnel_inc_refcount
l2tp_tunnel_register
l2tp_udp_encap_recv
l2tp_xmit_skb
lowpan_header_compress
lowpan_header_decompress
lowpan_nhc_add
lowpan_nhc_del
lowpan_register_netdev
lowpan_register_netdevice
lowpan_unregister_netdev
lowpan_unregister_netdevice
nfc_add_se
nfc_alloc_recv_skb
nfc_allocate_device
nfc_class
nfc_dep_link_is_up
nfc_driver_failure
nfc_find_se
nfc_fw_download_done
nfc_get_local_general_bytes
nfc_proto_register
nfc_proto_unregister
nfc_register_device
nfc_remove_se
nfc_se_connectivity
nfc_se_transaction
nfc_send_to_raw_sock
nfc_set_remote_general_bytes
nfc_target_lost
nfc_targets_found
nfc_tm_activated
nfc_tm_data_received
nfc_tm_deactivated
nfc_unregister_device
nfc_vendor_cmd_reply
of_can_transceiver
open_candev
ppp_channel_index
ppp_dev_name
ppp_input
ppp_input_error
ppp_output_wakeup
ppp_register_channel
ppp_register_compressor
ppp_register_net_channel
ppp_unit_number
ppp_unregister_channel
ppp_unregister_compressor
pppox_compat_ioctl
pppox_ioctl
pppox_unbind_sock
qca_read_soc_version
qca_send_pre_shutdown_cmd
qca_set_bdaddr
qca_set_bdaddr_rome
qca_uart_setup
rate_control_set_rates
reg_initiator_name
reg_query_regdb_wmm
register_candev
register_pppox_proto
regulatory_hint
regulatory_pre_cac_allowed
regulatory_set_wiphy_regd
regulatory_set_wiphy_regd_sync
rfc1042_header
rfkill_alloc
rfkill_blocked
rfkill_destroy
rfkill_find_type
rfkill_get_led_trigger_name
rfkill_init_sw_state
rfkill_pause_polling
rfkill_register
rfkill_resume_polling
rfkill_set_hw_state_reason
rfkill_set_led_trigger_name
rfkill_set_states
rfkill_set_sw_state
rfkill_unregister
safe_candev_priv
slhc_compress
slhc_free
slhc_init
slhc_remember
slhc_toss
slhc_uncompress
tipc_dump_done
tipc_dump_start
tipc_nl_sk_walk
tipc_sk_fill_sock_diag
unregister_candev
unregister_pppox_proto
usb_serial_claim_interface
usb_serial_deregister_drivers
usb_serial_generic_chars_in_buffer
usb_serial_generic_close
usb_serial_generic_get_icount
usb_serial_generic_open
usb_serial_generic_process_read_urb
usb_serial_generic_read_bulk_callback
usb_serial_generic_resume
usb_serial_generic_submit_read_urbs
usb_serial_generic_throttle
usb_serial_generic_tiocmiwait
usb_serial_generic_unthrottle
usb_serial_generic_wait_until_sent
usb_serial_generic_write
usb_serial_generic_write_bulk_callback
usb_serial_generic_write_start
usb_serial_handle_dcd_change
usb_serial_port_softint
usb_serial_register_drivers
usb_serial_resume
usb_serial_suspend
wdev_chandef
wdev_to_ieee80211_vif
wiphy_apply_custom_regulatory
wiphy_free
wiphy_new_nm
wiphy_read_of_freq_limits
wiphy_register
wiphy_rfkill_set_hw_state_reason
wiphy_rfkill_start_polling
wiphy_to_ieee80211_hw
wiphy_unregister
wpan_phy_find
wpan_phy_for_each
wpan_phy_free
wpan_phy_new
wpan_phy_register
wpan_phy_unregister

4
android/abi_gki_rockpi4 Normal file
View File

@@ -0,0 +1,4 @@
[abi_symbol_list]
# commonly used symbols
module_layout
__put_task_struct

View File

@@ -0,0 +1,48 @@
mm/zsmalloc.ko
drivers/block/zram/zram.ko
drivers/net/can/dev/can-dev.ko
drivers/net/can/vcan.ko
drivers/net/can/slcan.ko
drivers/net/ppp/ppp_generic.ko
drivers/net/ppp/bsd_comp.ko
drivers/net/ppp/ppp_deflate.ko
drivers/net/ppp/ppp_mppe.ko
drivers/net/ppp/pppox.ko
drivers/net/ppp/pptp.ko
drivers/net/slip/slhc.ko
drivers/usb/class/cdc-acm.ko
drivers/usb/serial/usbserial.ko
drivers/usb/serial/ftdi_sio.ko
drivers/bluetooth/hci_uart.ko
drivers/bluetooth/btsdio.ko
drivers/bluetooth/btbcm.ko
drivers/bluetooth/btqca.ko
net/8021q/8021q.ko
net/wireless/cfg80211.ko
net/can/can.ko
net/can/can-raw.ko
net/can/can-bcm.ko
net/can/can-gw.ko
net/bluetooth/bluetooth.ko
net/bluetooth/rfcomm/rfcomm.ko
net/bluetooth/hidp/hidp.ko
net/l2tp/l2tp_core.ko
net/l2tp/l2tp_ppp.ko
net/mac80211/mac80211.ko
net/tipc/tipc.ko
net/tipc/diag.ko
net/rfkill/rfkill.ko
net/6lowpan/6lowpan.ko
net/6lowpan/nhc_dest.ko
net/6lowpan/nhc_fragment.ko
net/6lowpan/nhc_hop.ko
net/6lowpan/nhc_ipv6.ko
net/6lowpan/nhc_mobility.ko
net/6lowpan/nhc_routing.ko
net/6lowpan/nhc_udp.ko
net/ieee802154/6lowpan/ieee802154_6lowpan.ko
net/ieee802154/ieee802154.ko
net/ieee802154/ieee802154_socket.ko
net/mac802154/mac802154.ko
net/nfc/nfc.ko
lib/crypto/libarc4.ko

View File

@@ -0,0 +1,47 @@
drivers/bluetooth/btbcm.ko
drivers/bluetooth/btqca.ko
drivers/bluetooth/btsdio.ko
drivers/bluetooth/hci_uart.ko
drivers/net/can/dev/can-dev.ko
drivers/net/can/slcan.ko
drivers/net/can/vcan.ko
drivers/net/ppp/bsd_comp.ko
drivers/net/ppp/ppp_deflate.ko
drivers/net/ppp/ppp_generic.ko
drivers/net/ppp/ppp_mppe.ko
drivers/net/ppp/pppox.ko
drivers/net/ppp/pptp.ko
drivers/net/slip/slhc.ko
drivers/usb/class/cdc-acm.ko
drivers/usb/serial/ftdi_sio.ko
drivers/usb/serial/usbserial.ko
lib/crypto/libarc4.ko
net/6lowpan/6lowpan.ko
net/6lowpan/nhc_dest.ko
net/6lowpan/nhc_fragment.ko
net/6lowpan/nhc_hop.ko
net/6lowpan/nhc_ipv6.ko
net/6lowpan/nhc_mobility.ko
net/6lowpan/nhc_routing.ko
net/6lowpan/nhc_udp.ko
net/8021q/8021q.ko
net/bluetooth/bluetooth.ko
net/bluetooth/hidp/hidp.ko
net/bluetooth/rfcomm/rfcomm.ko
net/can/can.ko
net/can/can-bcm.ko
net/can/can-gw.ko
net/can/can-raw.ko
net/ieee802154/6lowpan/ieee802154_6lowpan.ko
net/ieee802154/ieee802154.ko
net/ieee802154/ieee802154_socket.ko
net/l2tp/l2tp_core.ko
net/l2tp/l2tp_ppp.ko
net/mac80211/mac80211.ko
net/mac802154/mac802154.ko
net/nfc/nfc.ko
net/rfkill/rfkill.ko
net/tipc/diag.ko
net/tipc/tipc.ko
net/wireless/cfg80211.ko

View File

@@ -0,0 +1,49 @@
drivers/block/zram/zram.ko
drivers/bluetooth/btbcm.ko
drivers/bluetooth/btqca.ko
drivers/bluetooth/btsdio.ko
drivers/bluetooth/hci_uart.ko
drivers/net/can/dev/can-dev.ko
drivers/net/can/slcan.ko
drivers/net/can/vcan.ko
drivers/net/ppp/bsd_comp.ko
drivers/net/ppp/ppp_deflate.ko
drivers/net/ppp/ppp_generic.ko
drivers/net/ppp/ppp_mppe.ko
drivers/net/ppp/pppox.ko
drivers/net/ppp/pptp.ko
drivers/net/slip/slhc.ko
drivers/usb/class/cdc-acm.ko
drivers/usb/serial/ftdi_sio.ko
drivers/usb/serial/usbserial.ko
lib/crypto/libarc4.ko
mm/zsmalloc.ko
net/6lowpan/6lowpan.ko
net/6lowpan/nhc_dest.ko
net/6lowpan/nhc_fragment.ko
net/6lowpan/nhc_hop.ko
net/6lowpan/nhc_ipv6.ko
net/6lowpan/nhc_mobility.ko
net/6lowpan/nhc_routing.ko
net/6lowpan/nhc_udp.ko
net/8021q/8021q.ko
net/bluetooth/bluetooth.ko
net/bluetooth/hidp/hidp.ko
net/bluetooth/rfcomm/rfcomm.ko
net/can/can.ko
net/can/can-bcm.ko
net/can/can-gw.ko
net/can/can-raw.ko
net/ieee802154/6lowpan/ieee802154_6lowpan.ko
net/ieee802154/ieee802154.ko
net/ieee802154/ieee802154_socket.ko
net/l2tp/l2tp_core.ko
net/l2tp/l2tp_ppp.ko
net/mac80211/mac80211.ko
net/mac802154/mac802154.ko
net/nfc/nfc.ko
net/rfkill/rfkill.ko
net/tipc/diag.ko
net/tipc/tipc.ko
net/wireless/cfg80211.ko

View File

@@ -24,6 +24,13 @@ config KEXEC_ELF
config HAVE_IMA_KEXEC config HAVE_IMA_KEXEC
bool bool
config ARCH_HAS_SUBPAGE_FAULTS
bool
help
Select if the architecture can check permissions at sub-page
granularity (e.g. arm64 MTE). The probe_user_*() functions
must be implemented.
config SET_FS config SET_FS
bool bool
@@ -713,10 +720,7 @@ config ARCH_SUPPORTS_CFI_CLANG
config CFI_CLANG config CFI_CLANG
bool "Use Clang's Control Flow Integrity (CFI)" bool "Use Clang's Control Flow Integrity (CFI)"
depends on LTO_CLANG && ARCH_SUPPORTS_CFI_CLANG depends on LTO_CLANG && ARCH_SUPPORTS_CFI_CLANG
# Clang >= 12: depends on CLANG_VERSION >= 140000
# - https://bugs.llvm.org/show_bug.cgi?id=46258
# - https://bugs.llvm.org/show_bug.cgi?id=47479
depends on CLANG_VERSION >= 120000
select KALLSYMS select KALLSYMS
help help
This option enables Clang's forward-edge Control Flow Integrity This option enables Clang's forward-edge Control Flow Integrity
@@ -1238,6 +1242,9 @@ config RELR
config ARCH_HAS_MEM_ENCRYPT config ARCH_HAS_MEM_ENCRYPT
bool bool
config ARCH_HAS_MEM_RELINQUISH
bool
config ARCH_HAS_CC_PLATFORM config ARCH_HAS_CC_PLATFORM
bool bool
@@ -1295,6 +1302,17 @@ config ARCH_HAS_ELFCORE_COMPAT
config ARCH_HAS_PARANOID_L1D_FLUSH config ARCH_HAS_PARANOID_L1D_FLUSH
bool bool
config ARCH_HAVE_TRACE_MMIO_ACCESS
bool
config ARCH_HAS_NONLEAF_PMD_YOUNG
bool
help
Architectures that select this option are capable of setting the
accessed bit in non-leaf PMD entries when using them as part of linear
address translations. Page table walkers that clear the accessed bit
may use this capability to reduce their search space.
source "kernel/gcov/Kconfig" source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig" source "scripts/gcc-plugins/Kconfig"

View File

@@ -54,6 +54,7 @@ config ARM
select GENERIC_ATOMIC64 if CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI select GENERIC_ATOMIC64 if CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI
select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_CLOCKEVENTS_BROADCAST if SMP
select GENERIC_IRQ_IPI if SMP select GENERIC_IRQ_IPI if SMP
select ARCH_WANTS_IRQ_RAW if GENERIC_IRQ_IPI
select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_AUTOPROBE
select GENERIC_EARLY_IOREMAP select GENERIC_EARLY_IOREMAP
select GENERIC_IDLE_POLL_SETUP select GENERIC_IDLE_POLL_SETUP

1
arch/arm/OWNERS Normal file
View File

@@ -0,0 +1 @@
include ../arm64/OWNERS

View File

@@ -77,10 +77,10 @@ CPPFLAGS_vmlinux.lds += -DTEXT_OFFSET="$(TEXT_OFFSET)"
CPPFLAGS_vmlinux.lds += -DMALLOC_SIZE="$(MALLOC_SIZE)" CPPFLAGS_vmlinux.lds += -DMALLOC_SIZE="$(MALLOC_SIZE)"
compress-$(CONFIG_KERNEL_GZIP) = gzip compress-$(CONFIG_KERNEL_GZIP) = gzip
compress-$(CONFIG_KERNEL_LZO) = lzo compress-$(CONFIG_KERNEL_LZO) = lzo_with_size
compress-$(CONFIG_KERNEL_LZMA) = lzma compress-$(CONFIG_KERNEL_LZMA) = lzma_with_size
compress-$(CONFIG_KERNEL_XZ) = xzkern compress-$(CONFIG_KERNEL_XZ) = xzkern_with_size
compress-$(CONFIG_KERNEL_LZ4) = lz4 compress-$(CONFIG_KERNEL_LZ4) = lz4_with_size
libfdt_objs := fdt_rw.o fdt_ro.o fdt_wip.o fdt.o libfdt_objs := fdt_rw.o fdt_ro.o fdt_wip.o fdt.o

View File

@@ -6,5 +6,6 @@
void kvm_init_hyp_services(void); void kvm_init_hyp_services(void);
bool kvm_arm_hyp_service_available(u32 func_id); bool kvm_arm_hyp_service_available(u32 func_id);
void kvm_arm_init_hyp_services(void);
#endif #endif

View File

@@ -62,14 +62,8 @@ user_backtrace(struct frame_tail __user *tail,
void void
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{ {
struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
struct frame_tail __user *tail; struct frame_tail __user *tail;
if (guest_cbs && guest_cbs->is_in_guest()) {
/* We don't support guest os callchain now */
return;
}
perf_callchain_store(entry, regs->ARM_pc); perf_callchain_store(entry, regs->ARM_pc);
if (!current->mm) if (!current->mm)
@@ -99,44 +93,25 @@ callchain_trace(struct stackframe *fr,
void void
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{ {
struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
struct stackframe fr; struct stackframe fr;
if (guest_cbs && guest_cbs->is_in_guest()) {
/* We don't support guest os callchain now */
return;
}
arm_get_current_stackframe(regs, &fr); arm_get_current_stackframe(regs, &fr);
walk_stackframe(&fr, callchain_trace, entry); walk_stackframe(&fr, callchain_trace, entry);
} }
unsigned long perf_instruction_pointer(struct pt_regs *regs) unsigned long perf_instruction_pointer(struct pt_regs *regs)
{ {
struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
if (guest_cbs && guest_cbs->is_in_guest())
return guest_cbs->get_guest_ip();
return instruction_pointer(regs); return instruction_pointer(regs);
} }
unsigned long perf_misc_flags(struct pt_regs *regs) unsigned long perf_misc_flags(struct pt_regs *regs)
{ {
struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs();
int misc = 0; int misc = 0;
if (guest_cbs && guest_cbs->is_in_guest()) {
if (guest_cbs->is_user_mode())
misc |= PERF_RECORD_MISC_GUEST_USER;
else
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
} else {
if (user_mode(regs)) if (user_mode(regs))
misc |= PERF_RECORD_MISC_USER; misc |= PERF_RECORD_MISC_USER;
else else
misc |= PERF_RECORD_MISC_KERNEL; misc |= PERF_RECORD_MISC_KERNEL;
}
return misc; return misc;
} }

View File

@@ -51,6 +51,10 @@
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include <trace/events/ipi.h> #include <trace/events/ipi.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_raise);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_entry);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_exit);
/* /*
* as from 2.5, kernels no longer have an init_tasks structure * as from 2.5, kernels no longer have an init_tasks structure
* so we need some other way of telling a new secondary core * so we need some other way of telling a new secondary core
@@ -727,7 +731,12 @@ void __init set_smp_ipi_range(int ipi_base, int n)
WARN_ON(err); WARN_ON(err);
ipi_desc[i] = irq_to_desc(ipi_base + i); ipi_desc[i] = irq_to_desc(ipi_base + i);
if (i != IPI_RESCHEDULE)
irq_set_status_flags(ipi_base + i, IRQ_HIDDEN); irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
else
/* The rescheduling IPI is special... */
irq_set_status_flags(ipi_base + i, IRQ_HIDDEN|IRQ_RAW);
} }
ipi_irq_base = ipi_base; ipi_irq_base = ipi_base;

View File

@@ -32,7 +32,7 @@ pmd_t tmp_pmd_table[PTRS_PER_PMD] __page_aligned_bss;
static __init void *kasan_alloc_block(size_t size) static __init void *kasan_alloc_block(size_t size)
{ {
return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), return memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
MEMBLOCK_ALLOC_KASAN, NUMA_NO_NODE); MEMBLOCK_ALLOC_NOLEAKTRACE, NUMA_NO_NODE);
} }
static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,

View File

@@ -10,6 +10,7 @@ config ARM64
select ACPI_SPCR_TABLE if ACPI select ACPI_SPCR_TABLE if ACPI
select ACPI_PPTT if ACPI select ACPI_PPTT if ACPI
select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEBUG_WX
select ARCH_BINFMT_ELF_EXTRA_PHDRS
select ARCH_BINFMT_ELF_STATE select ARCH_BINFMT_ELF_STATE
select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTPLUG
@@ -25,9 +26,12 @@ config ARM64
select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_IOREMAP_PHYS_HOOKS
select ARCH_HAS_KCOV select ARCH_HAS_KCOV
select ARCH_HAS_KEEPINITRD select ARCH_HAS_KEEPINITRD
select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEM_RELINQUISH
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_DEVMAP
select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_PTE_SPECIAL
@@ -45,6 +49,7 @@ config ARM64
select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_ELF_PROT select ARCH_HAVE_ELF_PROT
select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_HAVE_TRACE_MMIO_ACCESS
select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK if !PREEMPTION
select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION
select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION
@@ -122,6 +127,7 @@ config ARM64
select GENERIC_FIND_FIRST_BIT select GENERIC_FIND_FIRST_BIT
select GENERIC_IDLE_POLL_SETUP select GENERIC_IDLE_POLL_SETUP
select GENERIC_IRQ_IPI select GENERIC_IRQ_IPI
select ARCH_WANTS_IRQ_RAW
select GENERIC_IRQ_PROBE select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL select GENERIC_IRQ_SHOW_LEVEL
@@ -135,6 +141,7 @@ config ARM64
select GENERIC_VDSO_TIME_NS select GENERIC_VDSO_TIME_NS
select HANDLE_DOMAIN_IRQ select HANDLE_DOMAIN_IRQ
select HARDIRQS_SW_RESEND select HARDIRQS_SW_RESEND
select HAVE_MOD_ARCH_SPECIFIC if (ARM64_MODULE_PLTS || KVM)
select HAVE_MOVE_PMD select HAVE_MOVE_PMD
select HAVE_MOVE_PUD select HAVE_MOVE_PUD
select HAVE_PCI select HAVE_PCI
@@ -185,6 +192,7 @@ config ARM64
select HAVE_GCC_PLUGINS select HAVE_GCC_PLUGINS
select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_HW_BREAKPOINT if PERF_EVENTS
select HAVE_IRQ_TIME_ACCOUNTING select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_KVM
select HAVE_NMI select HAVE_NMI
select HAVE_PATA_PLATFORM select HAVE_PATA_PLATFORM
select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS
@@ -203,7 +211,7 @@ config ARM64
select IOMMU_DMA if IOMMU_SUPPORT select IOMMU_DMA if IOMMU_SUPPORT
select IRQ_DOMAIN select IRQ_DOMAIN
select IRQ_FORCED_THREADING select IRQ_FORCED_THREADING
select KASAN_VMALLOC if KASAN_GENERIC select KASAN_VMALLOC if KASAN
select MODULES_USE_ELF_RELA select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH select NEED_SG_DMA_LENGTH
@@ -221,6 +229,7 @@ config ARM64
select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
select TRACE_IRQFLAGS_SUPPORT select TRACE_IRQFLAGS_SUPPORT
select TRACE_IRQFLAGS_NMI_SUPPORT select TRACE_IRQFLAGS_NMI_SUPPORT
select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
help help
ARM 64-bit (AArch64) Linux support. ARM 64-bit (AArch64) Linux support.
@@ -699,6 +708,130 @@ config ARM64_ERRATUM_1508412
If unsure, say Y. If unsure, say Y.
config ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE
bool
config ARM64_ERRATUM_2658417
bool "Cortex-A510: 2658417: remove BF16 support due to incorrect result"
default y
help
This option adds the workaround for ARM Cortex-A510 erratum 2658417.
Affected Cortex-A510 (r0p0 to r1p1) may produce the wrong result for
BFMMLA or VMMLA instructions in rare circumstances when a pair of
A510 CPUs are using shared neon hardware. As the sharing is not
discoverable by the kernel, hide the BF16 HWCAP to indicate that
user-space should not be using these instructions.
If unsure, say Y.
config ARM64_ERRATUM_2119858
bool "Cortex-A710: 2119858: workaround TRBE overwriting trace data in FILL mode"
default y
depends on CORESIGHT_TRBE
select ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE
help
This option adds the workaround for ARM Cortex-A710 erratum 2119858.
Affected Cortex-A710 cores could overwrite up to 3 cache lines of trace
data at the base of the buffer (pointed to by TRBASER_EL1) in FILL mode in
the event of a WRAP event.
Work around the issue by always making sure we move the TRBPTR_EL1 by
256 bytes before enabling the buffer and filling the first 256 bytes of
the buffer with ETM ignore packets upon disabling.
If unsure, say Y.
config ARM64_ERRATUM_2139208
bool "Neoverse-N2: 2139208: workaround TRBE overwriting trace data in FILL mode"
default y
depends on CORESIGHT_TRBE
select ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE
help
This option adds the workaround for ARM Neoverse-N2 erratum 2139208.
Affected Neoverse-N2 cores could overwrite up to 3 cache lines of trace
data at the base of the buffer (pointed to by TRBASER_EL1) in FILL mode in
the event of a WRAP event.
Work around the issue by always making sure we move the TRBPTR_EL1 by
256 bytes before enabling the buffer and filling the first 256 bytes of
the buffer with ETM ignore packets upon disabling.
If unsure, say Y.
config ARM64_WORKAROUND_TSB_FLUSH_FAILURE
bool
config ARM64_ERRATUM_2054223
bool "Cortex-A710: 2054223: workaround TSB instruction failing to flush trace"
default y
select ARM64_WORKAROUND_TSB_FLUSH_FAILURE
help
Enable workaround for ARM Cortex-A710 erratum 2054223.
Affected cores may fail to flush the trace data on a TSB instruction, when
the PE is in trace prohibited state. This will cause a few bytes of the
cached trace to be lost.
Workaround is to issue two TSB consecutively on affected cores.
If unsure, say Y.
config ARM64_ERRATUM_2067961
bool "Neoverse-N2: 2067961: workaround TSB instruction failing to flush trace"
default y
select ARM64_WORKAROUND_TSB_FLUSH_FAILURE
help
Enable workaround for ARM Neoverse-N2 erratum 2067961.
Affected cores may fail to flush the trace data on a TSB instruction, when
the PE is in trace prohibited state. This will cause a few bytes of the
cached trace to be lost.
Workaround is to issue two TSB consecutively on affected cores.
If unsure, say Y.
config ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
bool
config ARM64_ERRATUM_2253138
bool "Neoverse-N2: 2253138: workaround TRBE writing to address out-of-range"
depends on CORESIGHT_TRBE
default y
select ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
help
This option adds the workaround for ARM Neoverse-N2 erratum 2253138.
Affected Neoverse-N2 cores might write to an out-of-range address, not reserved
for TRBE. Under some conditions, the TRBE might generate a write to the next
virtually addressed page following the last page of the TRBE address space
(i.e., the TRBLIMITR_EL1.LIMIT), instead of wrapping around to the base.
Work around this in the driver by always making sure that there is a
page beyond the TRBLIMITR_EL1.LIMIT, within the space allowed for the TRBE.
If unsure, say Y.
config ARM64_ERRATUM_2224489
bool "Cortex-A710: 2224489: workaround TRBE writing to address out-of-range"
depends on CORESIGHT_TRBE
default y
select ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
help
This option adds the workaround for ARM Cortex-A710 erratum 2224489.
Affected Cortex-A710 cores might write to an out-of-range address, not reserved
for TRBE. Under some conditions, the TRBE might generate a write to the next
virtually addressed page following the last page of the TRBE address space
(i.e., the TRBLIMITR_EL1.LIMIT), instead of wrapping around to the base.
Work around this in the driver by always making sure that there is a
page beyond the TRBLIMITR_EL1.LIMIT, within the space allowed for the TRBE.
If unsure, say Y.
config ARM64_ERRATUM_2441009 config ARM64_ERRATUM_2441009
bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI" bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
default y default y
@@ -1639,6 +1772,21 @@ config ARM64_TLB_RANGE
The feature introduces new assembly instructions, and they were The feature introduces new assembly instructions, and they were
supported when binutils >= 2.30. supported when binutils >= 2.30.
config ARM64_MPAM
bool "Enable support for MPAM"
help
Memory Partitioning and Monitoring is an optional extension
that allows the CPUs to mark load and store transactions with
labels for partition-id and performance-monitoring-group.
System components, such as the caches, can use the partition-id
to apply a performance policy. MPAM monitors can use the
partition-id and performance-monitoring-group to measure the
cache occupancy or data throughput.
Use of this extension requires CPU support, support in the
memory system components (MSC), and a description from firmware
of where the MSC are in the address space.
endmenu endmenu
menu "ARMv8.5 architectural features" menu "ARMv8.5 architectural features"
@@ -1727,6 +1875,7 @@ config ARM64_MTE
depends on AS_HAS_LSE_ATOMICS depends on AS_HAS_LSE_ATOMICS
# Required for tag checking in the uaccess routines # Required for tag checking in the uaccess routines
depends on ARM64_PAN depends on ARM64_PAN
select ARCH_HAS_SUBPAGE_FAULTS
select ARCH_USES_HIGH_VMA_FLAGS select ARCH_USES_HIGH_VMA_FLAGS
help help
Memory Tagging (part of the ARMv8.5 Extensions) provides Memory Tagging (part of the ARMv8.5 Extensions) provides
@@ -1798,7 +1947,6 @@ config ARM64_SVE
config ARM64_MODULE_PLTS config ARM64_MODULE_PLTS
bool "Use PLTs to allow module memory to spill over into vmalloc area" bool "Use PLTs to allow module memory to spill over into vmalloc area"
depends on MODULES depends on MODULES
select HAVE_MOD_ARCH_SPECIFIC
help help
Allocate PLTs when loading modules so that jumps and calls whose Allocate PLTs when loading modules so that jumps and calls whose
targets are too far away for their relative offsets to be encoded targets are too far away for their relative offsets to be encoded
@@ -1932,6 +2080,12 @@ config CMDLINE_FROM_BOOTLOADER
the boot loader doesn't provide any, the default kernel command the boot loader doesn't provide any, the default kernel command
string provided in CMDLINE will be used. string provided in CMDLINE will be used.
config CMDLINE_EXTEND
bool "Extend bootloader kernel arguments"
help
The command-line arguments provided by the boot loader will be
appended to the default kernel command string.
config CMDLINE_FORCE config CMDLINE_FORCE
bool "Always use the default kernel command string" bool "Always use the default kernel command string"
help help

View File

@@ -167,7 +167,6 @@ config ARCH_MEDIATEK
config ARCH_MESON config ARCH_MESON
bool "Amlogic Platforms" bool "Amlogic Platforms"
select COMMON_CLK select COMMON_CLK
select MESON_IRQ_GPIO
help help
This enables support for the arm64 based Amlogic SoCs This enables support for the arm64 based Amlogic SoCs
such as the s905, S905X/D, S912, A113X/D or S905X/D2 such as the s905, S905X/D, S912, A113X/D or S905X/D2

View File

@@ -148,7 +148,10 @@ libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
boot := arch/arm64/boot boot := arch/arm64/boot
KBUILD_IMAGE := $(boot)/Image.gz KBUILD_IMAGE := $(boot)/Image.gz
# Don't compile Image in mixed build with "all" target
ifndef KBUILD_MIXED_TREE
all: Image.gz all: Image.gz
endif
Image: vmlinux Image: vmlinux
@@ -189,6 +192,11 @@ archclean:
$(Q)$(MAKE) $(clean)=arch/arm64/kernel/vdso $(Q)$(MAKE) $(clean)=arch/arm64/kernel/vdso
$(Q)$(MAKE) $(clean)=arch/arm64/kernel/vdso32 $(Q)$(MAKE) $(clean)=arch/arm64/kernel/vdso32
ifeq ($(CONFIG_KVM),y)
archscripts:
$(Q)$(MAKE) $(build)=arch/arm64/tools gen-hyprel
endif
ifeq ($(KBUILD_EXTMOD),) ifeq ($(KBUILD_EXTMOD),)
# We need to generate vdso-offsets.h before compiling certain files in kernel/. # We need to generate vdso-offsets.h before compiling certain files in kernel/.
# In order to do that, we should use the archprepare target, but we can't since # In order to do that, we should use the archprepare target, but we can't since

View File

@@ -0,0 +1,49 @@
# SPDX-License-Identifier: GPL-2.0
#
# This file is included by the generic Kbuild makefile to permit the
# architecture to perform postlink actions on vmlinux and any .ko module file.
# In this case, we only need it for fips140.ko, which needs some postprocessing
# for the integrity check mandated by FIPS. This involves making copies of the
# relocation sections so that the module will have access to them at
# initialization time, and calculating and injecting a HMAC digest into the
# module. All other targets are NOPs.
#
PHONY := __archpost
__archpost:
-include include/config/auto.conf
include scripts/Kbuild.include
# Program run as the last step of gen_hmac, with the module path as its only
# argument. Per the header above, this is the step that calculates and injects
# the HMAC digest.
CMD_FIPS140_GEN_HMAC = crypto/fips140_gen_hmac
# gen_hmac operates on the target module ($@) in three chained steps; a
# failure in any step stops the chain (&&):
#  1. objcopy --dump-section: write the sections whose names match
#     .rela.text* and .rela.rodata* (looked up with readelf -SW) to the side
#     files $@.rela.text and $@.rela.rodata.
#  2. objcopy --add-section: add those side files back into the module as
#     .init.rela.text and .init.rela.rodata, flagged alloc,readonly.
#  3. run $(CMD_FIPS140_GEN_HMAC) on the module.
quiet_cmd_gen_hmac = HMAC $@
cmd_gen_hmac = $(OBJCOPY) $@ \
--dump-section=$(shell $(READELF) -SW $@|grep -Eo '\.rela\.text\S*')=$@.rela.text \
--dump-section=$(shell $(READELF) -SW $@|grep -Eo '\.rela\.rodata\S*')=$@.rela.rodata && \
$(OBJCOPY) $@ \
--add-section=.init.rela.text=$@.rela.text \
--add-section=.init.rela.rodata=$@.rela.rodata \
--set-section-flags=.init.rela.text=alloc,readonly \
--set-section-flags=.init.rela.rodata=alloc,readonly && \
$(CMD_FIPS140_GEN_HMAC) $@
# `@true` prevents complaints when there is nothing to be done
vmlinux: FORCE
@true
# Only fips140.ko gets real postlink work; every other .ko falls through to
# the no-op pattern rule below.
$(objtree)/crypto/fips140.ko: FORCE
$(call cmd,gen_hmac)
%.ko: FORCE
@true
# Remove the side files written by step 1 of gen_hmac.
clean:
rm -f $(objtree)/crypto/fips140.ko.rela.*
PHONY += FORCE clean
FORCE:
.PHONY: $(PHONY)

4
arch/arm64/OWNERS Normal file
View File

@@ -0,0 +1,4 @@
per-file crypto/**=file:/crypto/OWNERS
per-file {include,kernel,kvm,lib}/**=mzyngier@google.com,willdeacon@google.com
per-file mm/**=file:/mm/OWNERS
per-file net/**=file:/net/OWNERS

View File

@@ -1,12 +1,14 @@
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
dtb-$(CONFIG_ARCH_MESON) += meson-axg-s400.dtb dtb-$(CONFIG_ARCH_MESON) += meson-axg-s400.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12a-sei510.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12a-sei510.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12a-sei510-android.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12a-u200.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12a-u200.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12a-x96-max.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12a-x96-max.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12b-gsking-x.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-gsking-x.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12b-gtking.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-gtking.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12b-gtking-pro.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-gtking-pro.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12b-a311d-khadas-vim3.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-a311d-khadas-vim3.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12b-a311d-khadas-vim3-android.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12b-s922x-khadas-vim3.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-s922x-khadas-vim3.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12b-odroid-n2.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-odroid-n2.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-g12b-odroid-n2-plus.dtb dtb-$(CONFIG_ARCH_MESON) += meson-g12b-odroid-n2-plus.dtb
@@ -50,7 +52,9 @@ dtb-$(CONFIG_ARCH_MESON) += meson-gxm-vega-s96.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-gxm-wetek-core2.dtb dtb-$(CONFIG_ARCH_MESON) += meson-gxm-wetek-core2.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-sm1-bananapi-m5.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-bananapi-m5.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-sm1-khadas-vim3l.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-khadas-vim3l.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-sm1-khadas-vim3l-android.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-sm1-odroid-c4.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-odroid-c4.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-sm1-odroid-hc4.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-odroid-hc4.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-sm1-sei610.dtb dtb-$(CONFIG_ARCH_MESON) += meson-sm1-sei610.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-sm1-sei610-android.dtb
dtb-$(CONFIG_ARCH_MESON) += meson-a1-ad401.dtb dtb-$(CONFIG_ARCH_MESON) += meson-a1-ad401.dtb

View File

@@ -0,0 +1,58 @@
// SPDX-License-Identifier: (GPL-2.0+ OR MIT)
/*
* Copyright (c) 2020 BayLibre SAS. All rights reserved.
*/
/*
* Device tree overlay (/plugin/) for the board identified below as
* "SEI Robotics SEI510". It adds a ramoops reserved-memory region, sets the
* key code of the adc_keys on/off button, disables the CVBS connector, and
* amends the nodes labelled vddao_3v3_t and uart_A, which are defined
* outside this file.
*/
/dts-v1/;
/plugin/;
#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/gpio/meson-g12a-gpio.h>
#include <dt-bindings/input/input.h>
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
/ {
compatible = "seirobotics,sei510", "amlogic,g12a";
model = "SEI Robotics SEI510";
/* Everything under __overlay__ is applied at the root ("/") of the target tree. */
fragment@101 {
target-path = "/";
__overlay__ {
reserved-memory {
#address-cells = <2>;
#size-cells = <2>;
/*
* With two address cells and two size cells, reg below describes a
* region at 0x0d000000 of length 0x00100000 (1 MiB). The record,
* console and pmsg sizes are 0x8000 (32 KiB) each; ftrace-size is 0.
*/
ramoops@d000000 {
compatible = "ramoops";
reg = <0x0 0x0d000000 0x0 0x00100000>;
record-size = <0x8000>;
console-size = <0x8000>;
ftrace-size = <0x0>;
pmsg-size = <0x8000>;
};
};
/* Report the on/off ADC button as BTN_0. */
adc_keys {
button-onoff {
linux,code = <BTN_0>;
};
};
cvbs-connector {
status = "disabled";
};
};
};
};
/* Add the gpio-open-drain property to the node labelled vddao_3v3_t. */
&vddao_3v3_t {
gpio-open-drain;
};
/*
* Give the bluetooth child of uart_A a level-high "host-wakeup" interrupt,
* number 95 on the gpio_intc interrupt controller.
*/
&uart_A {
bluetooth {
interrupt-parent = <&gpio_intc>;
interrupts = <95 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "host-wakeup";
};
};

View File

@@ -0,0 +1,55 @@
// SPDX-License-Identifier: (GPL-2.0+ OR MIT)
/*
* Copyright (c) 2019 BayLibre SAS. All rights reserved.
*/
/*
* Device tree overlay (/plugin/) for the board identified below as
* "Khadas VIM3". It adds a ramoops reserved-memory region and amends the
* nodes labelled vcc_5v, uart_C, emmc_pwrseq and sd_emmc_a, which are
* defined outside this file.
*/
/dts-v1/;
/plugin/;
#include <dt-bindings/phy/phy.h>
#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/gpio/meson-g12a-gpio.h>
#include <dt-bindings/input/input.h>
#include <dt-bindings/interrupt-controller/irq.h>
/ {
compatible = "khadas,vim3", "amlogic,a311d", "amlogic,g12b";
model = "Khadas VIM3";
/* Everything under __overlay__ is applied at the root ("/") of the target tree. */
fragment@101 {
target-path = "/";
__overlay__ {
reserved-memory {
#address-cells = <2>;
#size-cells = <2>;
/*
* With two address cells and two size cells, reg below describes a
* region at 0x0d000000 of length 0x00100000 (1 MiB). The record,
* console and pmsg sizes are 0x8000 (32 KiB) each; ftrace-size is 0.
*/
ramoops@d000000 {
compatible = "ramoops";
reg = <0x0 0x0d000000 0x0 0x00100000>;
record-size = <0x8000>;
console-size = <0x8000>;
ftrace-size = <0x0>;
pmsg-size = <0x8000>;
};
};
};
};
};
/* Add the gpio-open-drain property to the node labelled vcc_5v. */
&vcc_5v {
gpio-open-drain;
};
/* Enable uart_C and select uart_c_pins as its "default" pinctrl state. */
&uart_C {
status = "okay";
pinctrl-0 = <&uart_c_pins>;
pinctrl-names = "default";
};
/* Enable the node labelled emmc_pwrseq. */
&emmc_pwrseq{
status = "okay";
};
&sd_emmc_a {
/* WiFi firmware requires power to be kept while in suspend */
keep-power-in-suspend;
};

View File

@@ -0,0 +1,133 @@
// SPDX-License-Identifier: (GPL-2.0+ OR MIT)
/*
* Copyright (c) 2019 BayLibre SAS. All rights reserved.
*/
/dts-v1/;
/plugin/;
#include <dt-bindings/phy/phy.h>
#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/gpio/meson-g12a-gpio.h>
#include <dt-bindings/input/input.h>
#include <dt-bindings/interrupt-controller/irq.h>
/*
 * Overlay root for the Khadas VIM3L. fragment@101 targets the base tree's
 * root ("/") and adds a ramoops carve-out under reserved-memory.
 */
/ {
	compatible = "khadas,vim3l", "amlogic,sm1";
	model = "Khadas VIM3L";

	fragment@101 {
		target-path = "/";

		__overlay__ {
			reserved-memory {
				/* child "reg" uses two cells for address, two for size */
				#address-cells = <2>;
				#size-cells = <2>;

				/* 0x00100000 bytes (1 MiB) starting at 0x0d000000 */
				ramoops@d000000 {
					compatible = "ramoops";
					reg = <0x0 0x0d000000 0x0 0x00100000>;
					/* 0x8000 (32 KiB) for each of record, console and pmsg */
					record-size = <0x8000>;
					console-size = <0x8000>;
					/* ftrace size is zero */
					ftrace-size = <0x0>;
					pmsg-size = <0x8000>;
				};
			};
		};
	};
};
/*
 * Flag the GPIO of vcc_5v as open-drain.
 * NOTE(review): the node itself lives in the base tree; presumably a
 * GPIO-controlled regulator -- confirm.
 */
&vcc_5v {
	gpio-open-drain;
};
/*
 * Attach a "host-wakeup" interrupt to the bluetooth child of uart_A:
 * input 95 of gpio_intc, triggered while the level is high.
 */
&uart_A {
	bluetooth {
		interrupt-parent = <&gpio_intc>;
		interrupts = <95 IRQ_TYPE_LEVEL_HIGH>;
		interrupt-names = "host-wakeup";
	};
};
/*
 * uart_C is kept disabled on this board; its "default" pin state is
 * still bound to uart_c_pins.
 */
&uart_C {
	pinctrl-0 = <&uart_c_pins>;
	pinctrl-names = "default";
	status = "disabled";
};
/* Enable the emmc_pwrseq node that the base tree defines. */
&emmc_pwrseq {
	status = "okay";
};
/*
 * Leave the device behind sd_emmc_a powered during suspend: the WiFi
 * firmware needs its power to be kept.
 */
&sd_emmc_a {
	keep-power-in-suspend;
};
/*
 * Enable the spicc1 SPI controller with a single GPIO chip select
 * (GPIOH_6, active low). Three alternative children are described, all at
 * chip select 0 (reg = <0>); only spidev@0 is enabled, the two "nanohub"
 * sensor-hub variants are left disabled.
 */
&spicc1 {
	pinctrl-0 = <&spicc1_pins>;
	pinctrl-names = "default";
	cs-gpios = <&gpio GPIOH_6 GPIO_ACTIVE_LOW>;
	/* children are addressed by one cell (the chip select), no size */
	#address-cells = <1>;
	#size-cells = <0>;
	status = "okay";

	/* The enabled child on chip select 0, clocked at up to 500 kHz */
	spidev@0 {
		compatible = "rohm,dh2228fv";
		reg = <0>;
		spi-max-frequency = <500000>;
		status = "okay";
	};

	/* Sensor hub, "neonkey" wiring; disabled */
	neonkey@0 {
		compatible = "nanohub";
		reg = <0>;
		spi-max-frequency = <500000>;
		interrupt-parent = <&gpio_intc>;
		interrupts = <62 IRQ_TYPE_EDGE_RISING>; /* A1 */
		sensorhub,nreset-gpio = <&gpio GPIOA_0 0>;
		sensorhub,boot0-gpio = <&gpio GPIOA_3 0>; /* Fake (placeholder; confirm wiring) */
		sensorhub,wakeup-gpio = <&gpio GPIOA_2 0>; /* A2 -> PB9 */
		sensorhub,irq1-gpio = <&gpio GPIOA_1 0>; /* A1 -> PB5 */
		/* sensorhub,spi-cs-gpio = <&gpio GPIOH_6 GPIO_ACTIVE_LOW>; Optional */
		sensorhub,bl-addr = <0x08000000>;
		sensorhub,kernel-addr = <0x0800C000>;
		sensorhub,shared-addr = <0x08040000>;
		/*
		 * Six entries, matching num-flash-banks below.
		 * NOTE(review): each entry looks like <bank base size> --
		 * confirm against the nanohub driver.
		 */
		sensorhub,flash-banks =
			<0 0x08000000 0x04000>,
			<3 0x0800C000 0x04000>,
			<4 0x08010000 0x10000>,
			<5 0x08020000 0x20000>,
			<6 0x08040000 0x20000>,
			<7 0x08060000 0x20000>;
		sensorhub,num-flash-banks = <6>;
		status = "disabled";
	};

	/* Sensor hub, "argonkey" wiring (adds spi-cpol); disabled */
	argonkey@0 {
		compatible = "nanohub";
		reg = <0>;
		spi-max-frequency = <500000>;
		spi-cpol;
		interrupt-parent = <&gpio_intc>;
		interrupts = <63 IRQ_TYPE_EDGE_RISING>; /* A2 */
		sensorhub,nreset-gpio = <&gpio GPIOA_0 0>;
		sensorhub,boot0-gpio = <&gpio GPIOA_3 0>;
		sensorhub,wakeup-gpio = <&gpio GPIOA_1 0>; /* A1 -> PA0 */
		sensorhub,irq1-gpio = <&gpio GPIOA_2 0>; /* A2 -> PA1 */
		sensorhub,bl-addr = <0x08000000>;
		sensorhub,kernel-addr = <0x0800C000>;
		/* Four entries, matching num-flash-banks */
		sensorhub,flash-banks =
			<0 0x08000000 0x04000>,
			<3 0x0800C000 0x04000>,
			<4 0x08010000 0x10000>,
			<5 0x08020000 0x20000>;
		sensorhub,num-flash-banks = <4>;
		sensorhub,shared-addr = <0x08040000>;
		/* Six entries, matching num-shared-flash-banks */
		sensorhub,shared-flash-banks =
			<6 0x08040000 0x20000>,
			<7 0x08060000 0x20000>,
			<8 0x08080000 0x20000>,
			<9 0x080A0000 0x20000>,
			<10 0x080C0000 0x20000>,
			<11 0x080E0000 0x20000>;
		sensorhub,num-shared-flash-banks = <6>;
		status = "disabled";
	};
};

View File

@@ -0,0 +1,71 @@
// SPDX-License-Identifier: (GPL-2.0+ OR MIT)
/*
* Copyright (c) 2020 BayLibre SAS. All rights reserved.
*/
/dts-v1/;
/plugin/;
#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/gpio/meson-g12a-gpio.h>
#include <dt-bindings/input/input.h>
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
/*
 * Overlay root for the SEI Robotics SEI610. fragment@101 targets the base
 * tree's root ("/") and adds a ramoops carve-out under reserved-memory.
 */
/ {
	compatible = "seirobotics,sei610", "amlogic,sm1";
	model = "SEI Robotics SEI610";

	fragment@101 {
		target-path = "/";

		__overlay__ {
			reserved-memory {
				/* child "reg" uses two cells for address, two for size */
				#address-cells = <2>;
				#size-cells = <2>;

				/* 0x00100000 bytes (1 MiB) starting at 0x0d000000 */
				ramoops@d000000 {
					compatible = "ramoops";
					reg = <0x0 0x0d000000 0x0 0x00100000>;
					/* 0x8000 (32 KiB) for each of record, console and pmsg */
					record-size = <0x8000>;
					console-size = <0x8000>;
					/* ftrace size is zero */
					ftrace-size = <0x0>;
					pmsg-size = <0x8000>;
				};
			};
		};
	};
};
/*
 * Flag the GPIO of vddao_3v3_t as open-drain.
 * NOTE(review): the node itself lives in the base tree; presumably a
 * GPIO-controlled regulator -- confirm.
 */
&vddao_3v3_t {
	gpio-open-drain;
};
/* Enable the emmc_pwrseq node that the base tree defines. */
&emmc_pwrseq {
	status = "okay";
};
/*
 * Leave the device behind sd_emmc_a powered during suspend: the WiFi
 * firmware needs its power to be kept.
 */
&sd_emmc_a {
	keep-power-in-suspend;
};
/*
 * uart_C is kept disabled on this board; its "default" pin state is
 * still bound to uart_c_pins.
 */
&uart_C {
	pinctrl-0 = <&uart_c_pins>;
	pinctrl-names = "default";
	status = "disabled";
};
/*
 * Describe the spicc0 SPI controller (pins spicc0_x_pins, one GPIO chip
 * select on GPIOX_10, active low) with a spidev child on chip select 0.
 * Both the controller and the child are left disabled.
 */
&spicc0 {
	pinctrl-0 = <&spicc0_x_pins>;
	pinctrl-names = "default";
	cs-gpios = <&gpio GPIOX_10 GPIO_ACTIVE_LOW>;
	/* children are addressed by one cell (the chip select), no size */
	#address-cells = <1>;
	#size-cells = <0>;
	status = "disabled";

	spidev@0 {
		compatible = "rohm,dh2228fv";
		reg = <0>;
		spi-max-frequency = <500000>;
		status = "disabled";
	};
};

View File

@@ -16,6 +16,8 @@
/ { / {
model = "Qualcomm Technologies, Inc. Robotics RB5"; model = "Qualcomm Technologies, Inc. Robotics RB5";
compatible = "qcom,qrb5165-rb5", "qcom,sm8250"; compatible = "qcom,qrb5165-rb5", "qcom,sm8250";
qcom,msm-id = <455 0x20001>;
qcom,board-id = <11 3>;
aliases { aliases {
serial0 = &uart12; serial0 = &uart12;

View File

@@ -17,6 +17,8 @@
/ { / {
model = "Thundercomm Dragonboard 845c"; model = "Thundercomm Dragonboard 845c";
compatible = "thundercomm,db845c", "qcom,sdm845"; compatible = "thundercomm,db845c", "qcom,sdm845";
qcom,msm-id = <341 0x20001>;
qcom,board-id = <8 0>;
aliases { aliases {
serial0 = &uart9; serial0 = &uart9;

View File

@@ -31,7 +31,7 @@
}; };
&uart0 { &uart0 {
status = "okay"; status = "disabled";
bluetooth { bluetooth {
compatible = "brcm,bcm43438-bt"; compatible = "brcm,bcm43438-bt";

View File

@@ -0,0 +1,3 @@
CONFIG_ARM64_16K_PAGES=y
# Incremental FS is turned off in this 16K-page config; see b/241785095
# CONFIG_INCREMENTAL_FS is not set

Some files were not shown because too many files have changed in this diff Show More